/*
 *  linux/fs/buffer.c
 *
 *  Copyright (C) 1991, 1992, 2002  Linus Torvalds
 */

/*
 * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
 *
 * Removed a lot of unnecessary code and simplified things now that
 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
 *
 * Speed up hash, lru, and free list operations.  Use gfp() for allocating
 * hash table, use SLAB cache for buffer heads. SMP threading.  -DaveM
 *
 * Added 32k buffer block sizes - these are required older ARM systems. - RMK
 *
 * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
 */

#include <linux/config.h>
#include <linux/kernel.h>
#include <linux/syscalls.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/smp_lock.h>
#include <linux/blkdev.h>
#include <linux/file.h>
#include <linux/quotaops.h>
#include <linux/highmem.h>
#include <linux/module.h>
#include <linux/writeback.h>
#include <linux/hash.h>
#include <linux/suspend.h>
#include <linux/buffer_head.h>
#include <linux/bio.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/bitops.h>

static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
static void invalidate_bh_lrus(void);

#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)

inline void
init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
{
	bh->b_end_io = handler;
	bh->b_private = private;
}

static int sync_buffer(void *word)
{
	struct block_device *bd;
	struct buffer_head *bh
		= container_of(word, struct buffer_head, b_state);

	smp_mb();
	bd = bh->b_bdev;
	if (bd)
		blk_run_address_space(bd->bd_inode->i_mapping);
	io_schedule();
	return 0;
}

void fastcall __lock_buffer(struct buffer_head *bh)
{
	wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer,
							TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(__lock_buffer);

void fastcall unlock_buffer(struct buffer_head *bh)
{
	clear_buffer_locked(bh);
	smp_mb__after_clear_bit();
	wake_up_bit(&bh->b_state, BH_Lock);
}

/*
 * Block until a buffer comes unlocked.  This doesn't stop it
 * from becoming locked again - you have to lock it yourself
 * if you want to preserve its state.
 */
void __wait_on_buffer(struct buffer_head * bh)
{
	wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE);
}

static void
__clear_page_buffers(struct page *page)
{
	ClearPagePrivate(page);
	page->private = 0;
	page_cache_release(page);
}

static void buffer_io_error(struct buffer_head *bh)
{
	char b[BDEVNAME_SIZE];

	printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
			bdevname(bh->b_bdev, b),
			(unsigned long long)bh->b_blocknr);
}

/*
 * Default synchronous end-of-IO handler..  Just mark it up-to-date and
 * unlock the buffer. This is what ll_rw_block uses too.
 */
/**
 * ll_rw_block需要把缓冲区首部传递到通用块层，这样就增加了bh的引用计数。
 * 那么，在完成了IO传输后，就需要减少引用计数。这个工作是在bh的b_end_io中完成的。
 * b_end_io最终后调用end_buffer_read_sync或者end_buffer_write_sync
 */
void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
{
	if (uptodate) {
		set_buffer_uptodate(bh);
	} else {
		/* This happens, due to failed READA attempts. */
		clear_buffer_uptodate(bh);
	}
	unlock_buffer(bh);
	put_bh(bh);
}

/**
 * ll_rw_block需要把缓冲区首部传递到通用块层，这样就增加了bh的引用计数。
 * 那么，在完成了IO传输后，就需要减少引用计数。这个工作是在bh的b_end_io中完成的。
 * b_end_io最终后调用end_buffer_read_sync或者end_buffer_write_sync
 */
void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
{
	char b[BDEVNAME_SIZE];

	if (uptodate) {
		set_buffer_uptodate(bh);
	} else {
		if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
			buffer_io_error(bh);
			printk(KERN_WARNING "lost page write due to "
					"I/O error on %s\n",
				       bdevname(bh->b_bdev, b));
		}
		set_buffer_write_io_error(bh);
		clear_buffer_uptodate(bh);
	}
	unlock_buffer(bh);
	put_bh(bh);
}

/*
 * Write out and wait upon all the dirty data associated with a block
 * device via its mapping.  Does not take the superblock lock.
 */
int sync_blockdev(struct block_device *bdev)
{
	int ret = 0;

	if (bdev) {
		int err;

		ret = filemap_fdatawrite(bdev->bd_inode->i_mapping);
		err = filemap_fdatawait(bdev->bd_inode->i_mapping);
		if (!ret)
			ret = err;
	}
	return ret;
}
EXPORT_SYMBOL(sync_blockdev);

/*
 * Write out and wait upon all dirty data associated with this
 * superblock.  Filesystem data as well as the underlying block
 * device.  Takes the superblock lock.
 */
int fsync_super(struct super_block *sb)
{
	sync_inodes_sb(sb, 0);
	DQUOT_SYNC(sb);
	lock_super(sb);
	if (sb->s_dirt && sb->s_op->write_super)
		sb->s_op->write_super(sb);
	unlock_super(sb);
	if (sb->s_op->sync_fs)
		sb->s_op->sync_fs(sb, 1);
	sync_blockdev(sb->s_bdev);
	sync_inodes_sb(sb, 1);

	return sync_blockdev(sb->s_bdev);
}

/*
 * Write out and wait upon all dirty data associated with this
 * device.   Filesystem data as well as the underlying block
 * device.  Takes the superblock lock.
 */
int fsync_bdev(struct block_device *bdev)
{
	struct super_block *sb = get_super(bdev);
	if (sb) {
		int res = fsync_super(sb);
		drop_super(sb);
		return res;
	}
	return sync_blockdev(bdev);
}

/**
 * freeze_bdev  --  lock a filesystem and force it into a consistent state
 * @bdev:	blockdevice to lock
 *
 * This takes the block device bd_mount_sem to make sure no new mounts
 * happen on bdev until thaw_bdev() is called.
 * If a superblock is found on this device, we take the s_umount semaphore
 * on it to make sure nobody unmounts until the snapshot creation is done.
 */
struct super_block *freeze_bdev(struct block_device *bdev)
{
	struct super_block *sb;

	down(&bdev->bd_mount_sem);
	sb = get_super(bdev);
	if (sb && !(sb->s_flags & MS_RDONLY)) {
		sb->s_frozen = SB_FREEZE_WRITE;
		wmb();

		sync_inodes_sb(sb, 0);
		DQUOT_SYNC(sb);

		lock_super(sb);
		if (sb->s_dirt && sb->s_op->write_super)
			sb->s_op->write_super(sb);
		unlock_super(sb);

		if (sb->s_op->sync_fs)
			sb->s_op->sync_fs(sb, 1);

		sync_blockdev(sb->s_bdev);
		sync_inodes_sb(sb, 1);

		sb->s_frozen = SB_FREEZE_TRANS;
		wmb();

		sync_blockdev(sb->s_bdev);

		if (sb->s_op->write_super_lockfs)
			sb->s_op->write_super_lockfs(sb);
	}

	sync_blockdev(bdev);
	return sb;	/* thaw_bdev releases s->s_umount and bd_mount_sem */
}
EXPORT_SYMBOL(freeze_bdev);

/**
 * thaw_bdev  -- unlock filesystem
 * @bdev:	blockdevice to unlock
 * @sb:		associated superblock
 *
 * Unlocks the filesystem and marks it writeable again after freeze_bdev().
 */
void thaw_bdev(struct block_device *bdev, struct super_block *sb)
{
	if (sb) {
		BUG_ON(sb->s_bdev != bdev);

		if (sb->s_op->unlockfs)
			sb->s_op->unlockfs(sb);
		sb->s_frozen = SB_UNFROZEN;
		wmb();
		wake_up(&sb->s_wait_unfrozen);
		drop_super(sb);
	}

	up(&bdev->bd_mount_sem);
}
EXPORT_SYMBOL(thaw_bdev);

/*
 * sync everything.  Start out by waking pdflush, because that writes back
 * all queues in parallel.
 */
static void do_sync(unsigned long wait)
{
	/**
	 * 启动pdflush内核线程。将所有脏页写入到磁盘。
	 */
	wakeup_bdflush(0);
	/**
	 * 扫描超级块的链表，以搜索要刷新的脏索引节点。
	 */
	sync_inodes(0);		/* All mappings, inodes and their blockdevs */
	DQUOT_SYNC(NULL);
	/**
	 * 把脏超级块写到磁盘。
	 */
	sync_supers();		/* Write the superblocks */
	/**
	 * 为所有可写的文件系统执行sys_fs超级块方法。
	 * 这是文件系统的一个钩子，象ext3这样的日志文件系统使用这个方法。
	 */
	sync_filesystems(0);	/* Start syncing the filesystems */
	/**
	 * sync_filesystems和sync_inodes被再次调用。
	 * 这两个函数的参数有了变化。
	 */
	sync_filesystems(wait);	/* Waitingly sync the filesystems */
	sync_inodes(wait);	/* Mappings, inodes and blockdevs, again. */
	if (!wait)
		printk("Emergency Sync complete\n");
	if (unlikely(laptop_mode))
		laptop_sync_completion();
}

/**
 * sync系统调用的实现函数。
 */
asmlinkage long sys_sync(void)
{
	do_sync(1);
	return 0;
}

void emergency_sync(void)
{
	pdflush_operation(do_sync, 0);
}

/*
 * Generic function to fsync a file.
 *
 * filp may be NULL if called via the msync of a vma.
 */
 
int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
{
	struct inode * inode = dentry->d_inode;
	struct super_block * sb;
	int ret, err;

	/* sync the inode to buffers */
	ret = write_inode_now(inode, 0);

	/* sync the superblock to buffers */
	sb = inode->i_sb;
	lock_super(sb);
	if (sb->s_op->write_super)
		sb->s_op->write_super(sb);
	unlock_super(sb);

	/* .. finally sync the buffers to disk */
	err = sync_blockdev(sb->s_bdev);
	if (!ret)
		ret = err;
	return ret;
}

/**
 * 系统调用fsync的实现。
 * 将fd对应的所有脏缓冲区写到磁盘中，如果需要，还包括存有索引节点的缓冲区。
 */
asmlinkage long sys_fsync(unsigned int fd)
{
	struct file * file;
	struct address_space *mapping;
	int ret, err;

	ret = -EBADF;
	/**
	 * 获得文件对象的地址。
	 */
	file = fget(fd);
	if (!file)
		goto out;

	mapping = file->f_mapping;

	ret = -EINVAL;
	if (!file->f_op || !file->f_op->fsync) {
		/* Why?  We can still call filemap_fdatawrite */
		goto out_putf;
	}

	current->flags |= PF_SYNCWRITE;
	ret = filemap_fdatawrite(mapping);

	/*
	 * We need to protect against concurrent writers,
	 * which could cause livelocks in fsync_buffers_list
	 */
	down(&mapping->host->i_sem);
	/**
	 * 调用文件对象的fsync方法进行数据同步。
	 * 该回调函数通常是__writeback_single_inode。
	 */
	err = file->f_op->fsync(file, file->f_dentry, 0);
	if (!ret)
		ret = err;
	up(&mapping->host->i_sem);
	err = filemap_fdatawait(mapping);
	if (!ret)
		ret = err;
	current->flags &= ~PF_SYNCWRITE;

out_putf:
	fput(file);
out:
	return ret;
}

asmlinkage long sys_fdatasync(unsigned int fd)
{
	struct file * file;
	struct address_space *mapping;
	int ret, err;

	ret = -EBADF;
	file = fget(fd);
	if (!file)
		goto out;

	ret = -EINVAL;
	if (!file->f_op || !file->f_op->fsync)
		goto out_putf;

	mapping = file->f_mapping;

	current->flags |= PF_SYNCWRITE;
	ret = filemap_fdatawrite(mapping);
	down(&mapping->host->i_sem);
	err = file->f_op->fsync(file, file->f_dentry, 1);
	if (!ret)
		ret = err;
	up(&mapping->host->i_sem);
	err = filemap_fdatawait(mapping);
	if (!ret)
		ret = err;
	current->flags &= ~PF_SYNCWRITE;

out_putf:
	fput(file);
out:
	return ret;
}

/*
 * Various filesystems appear to want __find_get_block to be non-blocking.
 * But it's the page lock which protects the buffers.  To get around this,
 * we get exclusion from try_to_free_buffers with the blockdev mapping's
 * private_lock.
 *
 * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
 * may be quite high.  This code could TryLock the page, and if that
 * succeeds, there is no need to take private_lock. (But if
 * private_lock is contended then so is mapping->tree_lock).
 */
/**
 * 在高速缓存中搜索缓冲区首部。
 */
static struct buffer_head *
__find_get_block_slow(struct block_device *bdev, sector_t block, int unused)
{
	struct inode *bd_inode = bdev->bd_inode;
	struct address_space *bd_mapping = bd_inode->i_mapping;
	struct buffer_head *ret = NULL;
	pgoff_t index;
	struct buffer_head *bh;
	struct buffer_head *head;
	struct page *page;
	int all_mapped = 1;

	/**
	 * 根据块号和块大小得到与块设备相关的页索引。
	 */
	index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
	/**
	 * 调用find_get_page确定请求的块缓冲区页的会在页高速缓存中的位置。
	 */
	page = find_get_page(bd_mapping, index);

	/**
	 * 页没有在高速缓存中，块也当然不在高速缓存中。
	 */
	if (!page)
		goto out;

	spin_lock(&bd_mapping->private_lock);
	if (!page_has_buffers(page))
		goto out_unlock;
	head = page_buffers(page);
	bh = head;
	/**
	 * 在页的缓冲区首部链表中搜索逻辑块号等于block的块。
	 */
	do {
		if (bh->b_blocknr == block) {
			ret = bh;
			get_bh(bh);
			goto out_unlock;
		}
		if (!buffer_mapped(bh))
			all_mapped = 0;
		bh = bh->b_this_page;
	} while (bh != head);

	/* we might be here because some of the buffers on this page are
	 * not mapped.  This is due to various races between
	 * file io on the block device and getblk.  It gets dealt with
	 * elsewhere, don't buffer_error if we had some unmapped buffers
	 */
	if (all_mapped) {
		printk("__find_get_block_slow() failed. "
			"block=%llu, b_blocknr=%llu\n",
			(unsigned long long)block, (unsigned long long)bh->b_blocknr);
		printk("b_state=0x%08lx, b_size=%u\n", bh->b_state, bh->b_size);
		printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
	}
out_unlock:
	spin_unlock(&bd_mapping->private_lock);
	/**
	 * 递减描述符的count字段(find_get_page曾递增它的值)。
	 */
	page_cache_release(page);
out:
	return ret;
}

/* If invalidate_buffers() will trash dirty buffers, it means some kind
   of fs corruption is going on. Trashing dirty data always imply losing
   information that was supposed to be just stored on the physical layer
   by the user.

   Thus invalidate_buffers in general usage is not allwowed to trash
   dirty buffers. For example ioctl(FLSBLKBUF) expects dirty data to
   be preserved.  These buffers are simply skipped.
  
   We also skip buffers which are still in use.  For example this can
   happen if a userspace program is reading the block device.

   NOTE: In the case where the user removed a removable-media-disk even if
   there's still dirty data not synced on disk (due a bug in the device driver
   or due an error of the user), by not destroying the dirty buffers we could
   generate corruption also on the next media inserted, thus a parameter is
   necessary to handle this case in the most safe way possible (trying
   to not corrupt also the new disk inserted with the data belonging to
   the old now corrupted disk). Also for the ramdisk the natural thing
   to do in order to release the ramdisk memory is to destroy dirty buffers.

   These are two special cases. Normal usage imply the device driver
   to issue a sync on the device (without waiting I/O completion) and
   then an invalidate_buffers call that doesn't trash dirty buffers.

   For handling cache coherency with the blkdev pagecache the 'update' case
   is been introduced. It is needed to re-read from disk any pinned
   buffer. NOTE: re-reading from disk is destructive so we can do it only
   when we assume nobody is changing the buffercache under our I/O and when
   we think the disk contains more recent information than the buffercache.
   The update == 1 pass marks the buffers we need to update, the update == 2
   pass does the actual I/O. */
void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers)
{
	invalidate_bh_lrus();
	/*
	 * FIXME: what about destroy_dirty_buffers?
	 * We really want to use invalidate_inode_pages2() for
	 * that, but not until that's cleaned up.
	 */
	invalidate_inode_pages(bdev->bd_inode->i_mapping);
}

/*
 * Kick pdflush then try to free up some ZONE_NORMAL memory.
 */
/**
 * 内存紧缺回收。在分配VFS缓冲区或缓冲区首部时，内核调用此函数。
 */
static void free_more_memory(void)
{
	struct zone **zones;
	pg_data_t *pgdat;

	/**
	 * 唤醒pdflush，并触发页高速缓存中1024个脏页的写操作。
	 * 写脏页到磁盘的操作将最终使包含缓冲区、缓冲区首部和其他VFS数据结构的页框成为可释放的。
	 */
	wakeup_bdflush(1024);
	/**
	 * 调用yield使pdflush内核线程能够有机会得到执行。
	 */
	yield();

	/**
	 * 对系统的所有内存节点，启动一个循环。
	 */
	for_each_pgdat(pgdat) {
		zones = pgdat->node_zonelists[GFP_NOFS&GFP_ZONEMASK].zones;
		/**
		 * 对每个节点，调用try_to_free_pages，将紧缺内存管理区链表作为参数。
		 */
		if (*zones)
			try_to_free_pages(zones, GFP_NOFS, 0);
	}
}

/*
 * I/O completion handler for block_read_full_page() - pages
 * which come unlocked at the end of I/O.
 */
/**
 * 是缓冲区首部的完成方法。对块缓冲区的IO数据传输一结束，它就执行。
 * 它是block_read_full_page的I/O完成处理函数。
 */
static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
{
	static DEFINE_SPINLOCK(page_uptodate_lock);
	unsigned long flags;
	struct buffer_head *tmp;
	struct page *page;
	int page_uptodate = 1;

	/* 在block_read_full_page中应当设置了此标志，在此确认 */
	BUG_ON(!buffer_async_read(bh));

	page = bh->b_page;
	if (uptodate) {/* IO执行成功，设置uptodate标志 */
		set_buffer_uptodate(bh);
	} else {
		/* 否则设置页面错误标志 */
		clear_buffer_uptodate(bh);
		if (printk_ratelimit())
			buffer_io_error(bh);
		SetPageError(page);
	}

	/*
	 * Be _very_ careful from here on. Bad things can happen if
	 * two buffer heads end IO at almost the same time and both
	 * decide that the page is now completely done.
	 */
	spin_lock_irqsave(&page_uptodate_lock, flags);
	clear_buffer_async_read(bh);
	unlock_buffer(bh);
	tmp = bh;
	do {/* 循环检查页面中的所有缓冲区首部 */
		if (!buffer_uptodate(tmp))/* 某个缓冲区首部还没有更新，则清除页面uptodate标志 */
			page_uptodate = 0;
		if (buffer_async_read(tmp)) {/* 某个缓冲区还没有完成，仍然存在BH_Async_Read标志 */
			BUG_ON(!buffer_locked(tmp));
			goto still_busy;/* 整个页面还没有完成，退出，待所有缓冲区首部都完成后再继续 */
		}
		tmp = tmp->b_this_page;
	} while (tmp != bh);
	spin_unlock_irqrestore(&page_uptodate_lock, flags);

	/*
	 * If none of the buffers had errors and they are all
	 * uptodate then we can set the page uptodate.
	 */
	if (page_uptodate && !PageError(page))/* 整个页面都完成了，并且没有错误 */
		SetPageUptodate(page);/* 设置页面更新标志 */
	/* 解锁页面 */
	unlock_page(page);
	return;

still_busy:
	spin_unlock_irqrestore(&page_uptodate_lock, flags);
	return;
}

/*
 * Completion handler for block_write_full_page() - pages which are unlocked
 * during I/O, and which have PageWriteback cleared upon I/O completion.
 */
/* 块缓冲区被成功写入磁盘后,回调此函数 */
void end_buffer_async_write(struct buffer_head *bh, int uptodate)
{
	char b[BDEVNAME_SIZE];
	static DEFINE_SPINLOCK(page_uptodate_lock);
	unsigned long flags;
	struct buffer_head *tmp;
	struct page *page;

	/* 在block_write_full_page函数中，为脏块设置了此标志，这里检测标志，避免内存方面的问题 */
	BUG_ON(!buffer_async_write(bh));

	page = bh->b_page;
	if (uptodate) {/* 成功回写，设置uptodate标志表示缓冲区已经更新 */
		set_buffer_uptodate(bh);
	} else {
		if (printk_ratelimit()) {/* 打印提示信息 */
			buffer_io_error(bh);
			printk(KERN_WARNING "lost page write due to "
					"I/O error on %s\n",
			       bdevname(bh->b_bdev, b));
		}
		/* 设置错误标志，清除uptodate标志 */
		set_bit(AS_EIO, &page->mapping->flags);
		clear_buffer_uptodate(bh);
		SetPageError(page);
	}

	spin_lock_irqsave(&page_uptodate_lock, flags);
	/* 清除此标志，表示写已经完成 */
	clear_buffer_async_write(bh);
	unlock_buffer(bh);
	tmp = bh->b_this_page;
	/* 遍历缓冲区,并检查其脏标志 */
	while (tmp != bh) {
		if (buffer_async_write(tmp)) {/* 如果某个缓冲区块还没有写完,则表示页面未完全完成 */
			BUG_ON(!buffer_locked(tmp));
			goto still_busy;
		}
		tmp = tmp->b_this_page;
	}
	spin_unlock_irqrestore(&page_uptodate_lock, flags);
	/* 页面已经全部回写,设置页面的标志 */
	end_page_writeback(page);
	return;

still_busy:
	spin_unlock_irqrestore(&page_uptodate_lock, flags);
	return;
}

/*
 * If a page's buffers are under async readin (end_buffer_async_read
 * completion) then there is a possibility that another thread of
 * control could lock one of the buffers after it has completed
 * but while some of the other buffers have not completed.  This
 * locked buffer would confuse end_buffer_async_read() into not unlocking
 * the page.  So the absence of BH_Async_Read tells end_buffer_async_read()
 * that this buffer is not under async I/O.
 *
 * The page comes unlocked when it has no locked buffer_async buffers
 * left.
 *
 * PageLocked prevents anyone starting new async I/O reads any of
 * the buffers.
 *
 * PageWriteback is used to prevent simultaneous writeout of the same
 * page.
 *
 * PageLocked prevents anyone from starting writeback of a page which is
 * under read I/O (PageWriteback is only ever set against a locked page).
 */
static void mark_buffer_async_read(struct buffer_head *bh)
{
	bh->b_end_io = end_buffer_async_read;
	set_buffer_async_read(bh);
}

void mark_buffer_async_write(struct buffer_head *bh)
{
	bh->b_end_io = end_buffer_async_write;
	set_buffer_async_write(bh);
}
EXPORT_SYMBOL(mark_buffer_async_write);


/*
 * fs/buffer.c contains helper functions for buffer-backed address space's
 * fsync functions.  A common requirement for buffer-based filesystems is
 * that certain data from the backing blockdev needs to be written out for
 * a successful fsync().  For example, ext2 indirect blocks need to be
 * written back and waited upon before fsync() returns.
 *
 * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
 * inode_has_buffers() and invalidate_inode_buffers() are provided for the
 * management of a list of dependent buffers at ->i_mapping->private_list.
 *
 * Locking is a little subtle: try_to_free_buffers() will remove buffers
 * from their controlling inode's queue when they are being freed.  But
 * try_to_free_buffers() will be operating against the *blockdev* mapping
 * at the time, not against the S_ISREG file which depends on those buffers.
 * So the locking for private_list is via the private_lock in the address_space
 * which backs the buffers.  Which is different from the address_space 
 * against which the buffers are listed.  So for a particular address_space,
 * mapping->private_lock does *not* protect mapping->private_list!  In fact,
 * mapping->private_list will always be protected by the backing blockdev's
 * ->private_lock.
 *
 * Which introduces a requirement: all buffers on an address_space's
 * ->private_list must be from the same address_space: the blockdev's.
 *
 * address_spaces which do not place buffers at ->private_list via these
 * utility functions are free to use private_lock and private_list for
 * whatever they want.  The only requirement is that list_empty(private_list)
 * be true at clear_inode() time.
 *
 * FIXME: clear_inode should not call invalidate_inode_buffers().  The
 * filesystems should do that.  invalidate_inode_buffers() should just go
 * BUG_ON(!list_empty).
 *
 * FIXME: mark_buffer_dirty_inode() is a data-plane operation.  It should
 * take an address_space, not an inode.  And it should be called
 * mark_buffer_dirty_fsync() to clearly define why those buffers are being
 * queued up.
 *
 * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
 * list if it is already on a list.  Because if the buffer is on a list,
 * it *must* already be on the right one.  If not, the filesystem is being
 * silly.  This will save a ton of locking.  But first we have to ensure
 * that buffers are taken *off* the old inode's list when they are freed
 * (presumably in truncate).  That requires careful auditing of all
 * filesystems (do it inside bforget()).  It could also be done by bringing
 * b_inode back.
 */

/*
 * The buffer's backing address_space's private_lock must be held
 */
static inline void __remove_assoc_queue(struct buffer_head *bh)
{
	list_del_init(&bh->b_assoc_buffers);
}

int inode_has_buffers(struct inode *inode)
{
	return !list_empty(&inode->i_data.private_list);
}

/*
 * osync is designed to support O_SYNC io.  It waits synchronously for
 * all already-submitted IO to complete, but does not queue any new
 * writes to the disk.
 *
 * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
 * you dirty the buffers, and then use osync_inode_buffers to wait for
 * completion.  Any other dirty buffers which are not yet queued for
 * write will not be flushed to disk by the osync.
 */
static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
{
	struct buffer_head *bh;
	struct list_head *p;
	int err = 0;

	spin_lock(lock);
repeat:
	list_for_each_prev(p, list) {
		bh = BH_ENTRY(p);
		if (buffer_locked(bh)) {
			get_bh(bh);
			spin_unlock(lock);
			wait_on_buffer(bh);
			if (!buffer_uptodate(bh))
				err = -EIO;
			brelse(bh);
			spin_lock(lock);
			goto repeat;
		}
	}
	spin_unlock(lock);
	return err;
}

/**
 * sync_mapping_buffers - write out and wait upon a mapping's "associated"
 *                        buffers
 * @buffer_mapping - the mapping which backs the buffers' data
 * @mapping - the mapping which wants those buffers written
 *
 * Starts I/O against the buffers at mapping->private_list, and waits upon
 * that I/O.
 *
 * Basically, this is a convenience function for fsync().  @buffer_mapping is
 * the blockdev which "owns" the buffers and @mapping is a file or directory
 * which needs those buffers to be written for a successful fsync().
 */
/* 将缓冲区同步到磁盘，并等待其结束 */
int sync_mapping_buffers(struct address_space *mapping)
{
	/* 获得和文件inode地址空间相关联的地址空间，即块设备inode的地址空间 */
	struct address_space *buffer_mapping = mapping->assoc_mapping;

	/* 确保块设备地址空间不为空，并且其private_list是块设备的同步元数据不为空(表示有元数据需要同步) */
	if (buffer_mapping == NULL || list_empty(&mapping->private_list))
		return 0;

	/* 将块设备的元数据同步到磁盘 */
	return fsync_buffers_list(&buffer_mapping->private_lock,
					&mapping->private_list);
}
EXPORT_SYMBOL(sync_mapping_buffers);

/*
 * Called when we've recently written block `bblock', and it is known that
 * `bblock' was for a buffer_boundary() buffer.  This means that the block at
 * `bblock + 1' is probably a dirty indirect block.  Hunt it down and, if it's
 * dirty, schedule it for IO.  So that indirects merge nicely with their data.
 */
void write_boundary_block(struct block_device *bdev,
			sector_t bblock, unsigned blocksize)
{
	struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
	if (bh) {
		if (buffer_dirty(bh))
			ll_rw_block(WRITE, 1, &bh);
		put_bh(bh);
	}
}

void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
{
	struct address_space *mapping = inode->i_mapping;
	struct address_space *buffer_mapping = bh->b_page->mapping;

	mark_buffer_dirty(bh);
	if (!mapping->assoc_mapping) {
		mapping->assoc_mapping = buffer_mapping;
	} else {
		if (mapping->assoc_mapping != buffer_mapping)
			BUG();
	}
	if (list_empty(&bh->b_assoc_buffers)) {
		spin_lock(&buffer_mapping->private_lock);
		list_move_tail(&bh->b_assoc_buffers,
				&mapping->private_list);
		spin_unlock(&buffer_mapping->private_lock);
	}
}
EXPORT_SYMBOL(mark_buffer_dirty_inode);

/*
 * Add a page to the dirty page list.
 *
 * It is a sad fact of life that this function is called from several places
 * deeply under spinlocking.  It may not sleep.
 *
 * If the page has buffers, the uptodate buffers are set dirty, to preserve
 * dirty-state coherency between the page and the buffers.  It the page does
 * not have buffers then when they are later attached they will all be set
 * dirty.
 *
 * The buffers are dirtied before the page is dirtied.  There's a small race
 * window in which a writepage caller may see the page cleanness but not the
 * buffer dirtiness.  That's fine.  If this code were to set the page dirty
 * before the buffers, a concurrent writepage caller could clear the page dirty
 * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
 * page on the dirty page list.
 *
 * We use private_lock to lock against try_to_free_buffers while using the
 * page's buffer list.  Also use this to protect against clean buffers being
 * added to the page after it was set dirty.
 *
 * FIXME: may need to call ->reservepage here as well.  That's rather up to the
 * address_space though.
 */
int __set_page_dirty_buffers(struct page *page)
{
	struct address_space * const mapping = page->mapping;

	spin_lock(&mapping->private_lock);
	if (page_has_buffers(page)) {
		struct buffer_head *head = page_buffers(page);
		struct buffer_head *bh = head;

		do {
			set_buffer_dirty(bh);
			bh = bh->b_this_page;
		} while (bh != head);
	}
	spin_unlock(&mapping->private_lock);

	if (!TestSetPageDirty(page)) {
		spin_lock_irq(&mapping->tree_lock);
		if (page->mapping) {	/* Race with truncate? */
			if (!mapping->backing_dev_info->memory_backed)
				inc_page_state(nr_dirty);
			radix_tree_tag_set(&mapping->page_tree,
						page_index(page),
						PAGECACHE_TAG_DIRTY);
		}
		spin_unlock_irq(&mapping->tree_lock);
		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
	}
	
	return 0;
}
EXPORT_SYMBOL(__set_page_dirty_buffers);

/*
 * Write out and wait upon a list of buffers.
 *
 * We have conflicting pressures: we want to make sure that all
 * initially dirty buffers get waited on, but that any subsequently
 * dirtied buffers don't.  After all, we don't want fsync to last
 * forever if somebody is actively writing to the file.
 *
 * Do this in two main stages: first we copy dirty buffers to a
 * temporary inode list, queueing the writes as we go.  Then we clean
 * up, waiting for those writes to complete.
 * 
 * During this second stage, any subsequent updates to the file may end
 * up refiling the buffer on the original inode's dirty list again, so
 * there is a chance we will end up with a buffer queued for write but
 * not yet completed on that list.  So, as a final cleanup we go through
 * the osync code to catch these locked, dirty buffers without requeuing
 * any newly dirty buffers for write.
 */
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
{
	struct buffer_head *bh;
	struct list_head tmp;
	int err = 0, err2;

	INIT_LIST_HEAD(&tmp);

	spin_lock(lock);
	while (!list_empty(list)) {
		bh = BH_ENTRY(list->next);
		list_del_init(&bh->b_assoc_buffers);
		if (buffer_dirty(bh) || buffer_locked(bh)) {
			list_add(&bh->b_assoc_buffers, &tmp);
			if (buffer_dirty(bh)) {
				get_bh(bh);
				spin_unlock(lock);
				/*
				 * Ensure any pending I/O completes so that
				 * ll_rw_block() actually writes the current
				 * contents - it is a noop if I/O is still in
				 * flight on potentially older contents.
				 */
				wait_on_buffer(bh);
				ll_rw_block(WRITE, 1, &bh);
				brelse(bh);
				spin_lock(lock);
			}
		}
	}

	while (!list_empty(&tmp)) {
		bh = BH_ENTRY(tmp.prev);
		__remove_assoc_queue(bh);
		get_bh(bh);
		spin_unlock(lock);
		wait_on_buffer(bh);
		if (!buffer_uptodate(bh))
			err = -EIO;
		brelse(bh);
		spin_lock(lock);
	}
	
	spin_unlock(lock);
	err2 = osync_buffers_list(lock, list);
	if (err)
		return err;
	else
		return err2;
}

/*
 * Invalidate any and all dirty buffers on a given inode.  We are
 * probably unmounting the fs, but that doesn't mean we have already
 * done a sync().  Just drop the buffers from the inode list.
 *
 * NOTE: we take the inode's blockdev's mapping's private_lock.  Which
 * assumes that all the buffers are against the blockdev.  Not true
 * for reiserfs.
 */
void invalidate_inode_buffers(struct inode *inode)
{
	if (inode_has_buffers(inode)) {
		struct address_space *mapping = &inode->i_data;
		struct list_head *list = &mapping->private_list;
		struct address_space *buffer_mapping = mapping->assoc_mapping;

		spin_lock(&buffer_mapping->private_lock);
		while (!list_empty(list))
			__remove_assoc_queue(BH_ENTRY(list->next));
		spin_unlock(&buffer_mapping->private_lock);
	}
}

/*
 * Remove any clean buffers from the inode's buffer list.  This is called
 * when we're trying to free the inode itself.  Those buffers can pin it.
 *
 * Returns true if all buffers were removed.
 */
int remove_inode_buffers(struct inode *inode)
{
	int ret = 1;

	if (inode_has_buffers(inode)) {
		struct address_space *mapping = &inode->i_data;
		struct list_head *list = &mapping->private_list;
		struct address_space *buffer_mapping = mapping->assoc_mapping;

		spin_lock(&buffer_mapping->private_lock);
		while (!list_empty(list)) {
			struct buffer_head *bh = BH_ENTRY(list->next);
			if (buffer_dirty(bh)) {
				ret = 0;
				break;
			}
			__remove_assoc_queue(bh);
		}
		spin_unlock(&buffer_mapping->private_lock);
	}
	return ret;
}

/*
 * Create the appropriate buffers when given a page for data area and
 * the size of each buffer.. Use the bh->b_this_page linked list to
 * follow the buffers created.  Return NULL if unable to create more
 * buffers.
 *
 * The retry flag is used to differentiate async IO (paging, swapping)
 * which may not fail from ordinary buffer allocations.
 */
/**
 * 根据页中所请求的块大小分配缓冲区首部，并把它们插入由b_this_page字段实现的单向循环链表
 * 此外，函数用页描述符的地址初始化缓冲区首部的b_page字段。用块缓冲区在页内的线性地址或偏移量初始化b_data字段
 */
struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
		int retry)
{
	struct buffer_head *bh, *head;
	long offset;

try_again:
	head = NULL;
	offset = PAGE_SIZE;
	while ((offset -= size) >= 0) {
		bh = alloc_buffer_head(GFP_NOFS);
		if (!bh)
			goto no_grow;

		bh->b_bdev = NULL;
		bh->b_this_page = head;
		bh->b_blocknr = -1;
		head = bh;

		bh->b_state = 0;
		atomic_set(&bh->b_count, 0);
		bh->b_size = size;

		/* Link the buffer to its page */
		set_bh_page(bh, page, offset);

		bh->b_end_io = NULL;
	}
	return head;
/*
 * In case anything failed, we just free everything we got.
 */
no_grow:
	if (head) {
		do {
			bh = head;
			head = head->b_this_page;
			free_buffer_head(bh);
		} while (head);
	}

	/*
	 * Return failure for non-async IO requests.  Async IO requests
	 * are not allowed to fail, so we have to wait until buffer heads
	 * become available.  But we don't want tasks sleeping with 
	 * partially complete buffers, so all were released above.
	 */
	if (!retry)
		return NULL;

	/* We're _really_ low on memory. Now we just
	 * wait for old buffer heads to become free due to
	 * finishing IO.  Since this is an async request and
	 * the reserve list is empty, we're sure there are 
	 * async buffer heads in use.
	 */
	free_more_memory();
	goto try_again;
}
EXPORT_SYMBOL_GPL(alloc_page_buffers);

static inline void
link_dev_buffers(struct page *page, struct buffer_head *head)
{
	struct buffer_head *bh, *tail;

	bh = head;
	do {
		tail = bh;
		bh = bh->b_this_page;
	} while (bh);
	tail->b_this_page = head;
	attach_page_buffers(page, head);
}

/*
 * Initialise the state of a blockdev page's buffers.
 */ 
static void
init_page_buffers(struct page *page, struct block_device *bdev,
			sector_t block, int size)
{
	struct buffer_head *head = page_buffers(page);
	struct buffer_head *bh = head;
	int uptodate = PageUptodate(page);

	do {
		if (!buffer_mapped(bh)) {
			init_buffer(bh, NULL, NULL);
			bh->b_bdev = bdev;
			bh->b_blocknr = block;
			if (uptodate)
				set_buffer_uptodate(bh);
			set_buffer_mapped(bh);
		}
		block++;
		bh = bh->b_this_page;
	} while (bh != head);
}

/*
 * Create the page-cache page that contains the requested block.
 *
 * This is user purely for blockdev mappings.
 */
/**
 * 创建新的块设备缓冲区页。
 */
static struct page *
grow_dev_page(struct block_device *bdev, sector_t block,
		pgoff_t index, int size)
{
	struct inode *inode = bdev->bd_inode;
	struct page *page;
	struct buffer_head *bh;

	/**
	 * 调用find_or_create_page，传递的参数是块设备的address_space对象、页偏移index以及GFP_NOFS标志。
	 * 该函数在页调整缓存中搜索需要的页，如果需要，就把新页插入页调整缓存。
	 */
	page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
	if (!page)
		return NULL;

	if (!PageLocked(page))
		BUG();

	/**
	 * 页已经在页高速缓存中。检查它的PG_private标志。
	 */
	if (page_has_buffers(page)) {
		/**
		 * 页已经是缓冲区页。从页描述符的private字段获得第一个缓冲区首部的地址bh。
		 */
		bh = page_buffers(page);
		/**
		 * 检查页中块的大小
		 */
		if (bh->b_size == size) {
			init_page_buffers(page, bdev, block, size);
			return page;
		}
		/**
		 * 页中块的大小有错误，就调用try_to_free_buffers释放缓冲区页的上一个缓冲区首部。
		 */
		if (!try_to_free_buffers(page))
			goto failed;
	}

	/*
	 * Allocate some buffers for this page
	 */
	/**
	 * 页还不是缓冲区页，调用alloc_page_buffers根据页中请求的块大小分配缓冲区首部。
	 * 并将它们插入由b_this_page字段实现的单向循环链表中。
	 * 此外，函数用页描述符的地址初始化缓冲区首部的b_page字段，用块缓冲区在页内的纯属地址或者偏移量初始化b_data字段。
	 */
	bh = alloc_page_buffers(page, size, 0);
	if (!bh)
		goto failed;

	/*
	 * Link the page to the buffers and initialise them.  Take the
	 * lock to be atomic wrt __find_get_block(), which does not
	 * run under the page lock.
	 */
	/**
	 * 在字段private中存放第一个缓冲区首部的地址，把PG_private字段置位。并递增页的使用计数器。
	 */ 
	spin_lock(&inode->i_mapping->private_lock);
	link_dev_buffers(page, bh);
	/**
	 * init_page_buffers函数初始化连接到页的缓冲区首部的b_bdev、b+blocknr和b_bstate字段。
	 * 因为所有的块在磁盘上都是相邻的，因此逻辑块号是连续的。
	 */
	init_page_buffers(page, bdev, block, size);
	spin_unlock(&inode->i_mapping->private_lock);
	return page;

failed:
	BUG();
	unlock_page(page);
	page_cache_release(page);
	return NULL;
}

/*
 * Create buffers for the specified block device block's page.  If
 * that page was dirty, the buffers are set dirty also.
 *
 * Except that's a bug.  Attaching dirty buffers to a dirty
 * blockdev's page can result in filesystem corruption, because
 * some of those buffers may be aliases of filesystem data.
 * grow_dev_page() will go BUG() if this happens.
 */
/**
 * 把块设备缓冲区页添加到页高速缓存中。
 * bdev:		块设备描述符
 * block:		逻辑块号
 * size:		块大小
 */
static inline int
grow_buffers(struct block_device *bdev, sector_t block, int size)
{
	struct page *page;
	pgoff_t index;
	int sizebits;

	/**
	  * 计算数据页在所请求块的块设备中的偏移量
	  */
	sizebits = -1;
	do {
		sizebits++;
	} while ((size << sizebits) < PAGE_SIZE);

	index = block >> sizebits;
	block = index << sizebits;

	/* Create a page with the proper size buffers.. */
	/**
	 * 调用grow_dev_page创建新的块设备缓冲区页。
	 */
	page = grow_dev_page(bdev, block, index, size);
	if (!page)
		return 0;
	/**
	 * 为页解锁，因为在grow_dev_page中为page加了锁。
	 */
	unlock_page(page);
	/**
	 * 递减页的使用计数器(函数find_or_create_page曾递增了计数器)
	 */
	page_cache_release(page);
	return 1;
}

/**
 * 为请求的页分配的一个新的缓冲区。
 */
struct buffer_head *
__getblk_slow(struct block_device *bdev, sector_t block, int size)
{
	/* Size must be multiple of hard sectorsize */
	if (unlikely(size & (bdev_hardsect_size(bdev)-1) ||
			(size < 512 || size > PAGE_SIZE))) {
		printk(KERN_ERR "getblk(): invalid block size %d requested\n",
					size);
		printk(KERN_ERR "hardsect size: %d\n",
					bdev_hardsect_size(bdev));

		dump_stack();
		return NULL;
	}

	for (;;) {
		struct buffer_head * bh;

		/**
		 * 确定缓冲区是否已经在页调整缓存中。
		 */
		bh = __find_get_block(bdev, block, size);
		if (bh)
			return bh;

		/**
		 * 不在页高速缓存中，则调用grow_buffers为所请求的页分配一个新的缓冲区页。
		 */
		if (!grow_buffers(bdev, block, size))
			free_more_memory();/* 分配页失败，则试图通过调用函数free_more_memory回收一部分内存 */
	}
}

/*
 * The relationship between dirty buffers and dirty pages:
 *
 * Whenever a page has any dirty buffers, the page's dirty bit is set, and
 * the page is tagged dirty in its radix tree.
 *
 * At all times, the dirtiness of the buffers represents the dirtiness of
 * subsections of the page.  If the page has buffers, the page dirty bit is
 * merely a hint about the true dirty state.
 *
 * When a page is set dirty in its entirety, all its buffers are marked dirty
 * (if the page has buffers).
 *
 * When a buffer is marked dirty, its page is dirtied, but the page's other
 * buffers are not.
 *
 * Also.  When blockdev buffers are explicitly read with bread(), they
 * individually become uptodate.  But their backing page remains not
 * uptodate - even if all of its buffers are uptodate.  A subsequent
 * block_read_full_page() against that page will discover all the uptodate
 * buffers, will set the page uptodate and will perform no I/O.
 */

/**
 * mark_buffer_dirty - mark a buffer_head as needing writeout
 *
 * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
 * backing page dirty, then tag the page as dirty in its address_space's radix
 * tree and then attach the address_space's inode to its superblock's dirty
 * inode list.
 *
 * mark_buffer_dirty() is atomic.  It takes bh->b_page->mapping->private_lock,
 * mapping->tree_lock and the global inode_lock.
 */
void fastcall mark_buffer_dirty(struct buffer_head *bh)
{
	if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh))
		__set_page_dirty_nobuffers(bh->b_page);
}

/*
 * Decrement a buffer_head's reference count.  If all buffers against a page
 * have zero reference count, are clean and unlocked, and if the page is clean
 * and unlocked then try_to_free_buffers() may strip the buffers from the page
 * in preparation for freeing it (sometimes, rarely, buffers are removed from
 * a page but it ends up not being freed, and buffers may later be reattached).
 */
/**
 * 当内核控制路径停止访问块缓冲区时，需要调用brelse递减相应的引用计数器。
 * 请注意它与__bforget之间的差别。
 */
void __brelse(struct buffer_head * buf)
{
	if (atomic_read(&buf->b_count)) {
		put_bh(buf);
		return;
	}
	printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n");
	WARN_ON(1);
}

/*
 * bforget() is like brelse(), except it discards any
 * potentially dirty data.
 */
/**
 * 当内核停止访问块缓冲区时，应该调用__brelse或者__bforget递减相应的引用计数器。
 * 实际上，__bforget还会：从间接块链表（缓冲区首部的b_assoc_buufers字段）中删除块。
 * 并把该缓冲区标记为干净的。因此强制内核忽略对缓冲区所做的任何修改。
 * 但是，实际上缓冲区仍然必须被写回磁盘。
 */
void __bforget(struct buffer_head *bh)
{
	clear_buffer_dirty(bh);
	if (!list_empty(&bh->b_assoc_buffers)) {
		struct address_space *buffer_mapping = bh->b_page->mapping;

		spin_lock(&buffer_mapping->private_lock);
		list_del_init(&bh->b_assoc_buffers);
		spin_unlock(&buffer_mapping->private_lock);
	}
	__brelse(bh);
}
/**
 * 从块设备中读取缓冲区首部
 */
static struct buffer_head *__bread_slow(struct buffer_head *bh)
{
	lock_buffer(bh);
	if (buffer_uptodate(bh)) {
		unlock_buffer(bh);
		return bh;
	} else {
		/**
		 * 首先增加缓冲区的引用计数。
		 */
		get_bh(bh);
		/**
		 * 将end_buffer_read_sync赋给b_end_io。当块设备读取完数据后，会回调此函数。
		 */
		bh->b_end_io = end_buffer_read_sync;
		/**
		 * submit_bh把缓冲区首部传送到通用块层。
		 */
		submit_bh(READ, bh);
		/**
		 * 把当前进入插入到等待队列，直到I/O操作完成。
		 */
		wait_on_buffer(bh);
		if (buffer_uptodate(bh))
			return bh;
	}
	brelse(bh);
	return NULL;
}

/*
 * Per-cpu buffer LRU implementation.  To reduce the cost of __find_get_block().
 * The bhs[] array is sorted - newest buffer is at bhs[0].  Buffers have their
 * refcount elevated by one when they're in an LRU.  A buffer can only appear
 * once in a particular CPU's LRU.  A single buffer can be present in multiple
 * CPU's LRUs at the same time.
 *
 * This is a transparent caching front-end to sb_bread(), sb_getblk() and
 * sb_find_get_block().
 *
 * The LRUs themselves only need locking against invalidate_bh_lrus.  We use
 * a local interrupt disable for that.
 */

#define BH_LRU_SIZE	8

struct bh_lru {
	struct buffer_head *bhs[BH_LRU_SIZE];
};

/**
 * 在页高速缓存中搜索块时，为了提高性能，内核维持一个小的磁盘高速缓存数组bh_lrus（每CPU变量）。
 * 即所谓的最近最少使用（LRU）块高速缓存。
 * 每个磁盘高速缓存有8个指针，指向被指定CPU最近访问过的缓冲区首部。
 * 对每个CPU的数据进行排序。使指向最后被使用过的那个缓冲区首部的指针索引为0。
 * 相同的缓冲区首部可能出现在几个CPU数组中。
 */
static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};

#ifdef CONFIG_SMP
#define bh_lru_lock()	local_irq_disable()
#define bh_lru_unlock()	local_irq_enable()
#else
#define bh_lru_lock()	preempt_disable()
#define bh_lru_unlock()	preempt_enable()
#endif

static inline void check_irqs_on(void)
{
#ifdef irqs_disabled
	BUG_ON(irqs_disabled());
#endif
}

/*
 * The LRU management algorithm is dopey-but-simple.  Sorry.
 */
static void bh_lru_install(struct buffer_head *bh)
{
	struct buffer_head *evictee = NULL;
	struct bh_lru *lru;

	check_irqs_on();
	bh_lru_lock();
	lru = &__get_cpu_var(bh_lrus);
	if (lru->bhs[0] != bh) {
		struct buffer_head *bhs[BH_LRU_SIZE];
		int in;
		int out = 0;

		get_bh(bh);
		bhs[out++] = bh;
		for (in = 0; in < BH_LRU_SIZE; in++) {
			struct buffer_head *bh2 = lru->bhs[in];

			if (bh2 == bh) {
				__brelse(bh2);
			} else {
				if (out >= BH_LRU_SIZE) {
					BUG_ON(evictee != NULL);
					evictee = bh2;
				} else {
					bhs[out++] = bh2;
				}
			}
		}
		while (out < BH_LRU_SIZE)
			bhs[out++] = NULL;
		memcpy(lru->bhs, bhs, sizeof(bhs));
	}
	bh_lru_unlock();

	if (evictee)
		__brelse(evictee);
}

/*
 * Look up the bh in this cpu's LRU.  If it's there, move it to the head.
 */
static inline struct buffer_head *
lookup_bh_lru(struct block_device *bdev, sector_t block, int size)
{
	struct buffer_head *ret = NULL;
	struct bh_lru *lru;
	int i;

	check_irqs_on();
	bh_lru_lock();
	lru = &__get_cpu_var(bh_lrus);
	for (i = 0; i < BH_LRU_SIZE; i++) {
		struct buffer_head *bh = lru->bhs[i];

		if (bh && bh->b_bdev == bdev &&
				bh->b_blocknr == block && bh->b_size == size) {
			if (i) {
				while (i) {
					lru->bhs[i] = lru->bhs[i - 1];
					i--;
				}
				lru->bhs[0] = bh;
			}
			get_bh(bh);
			ret = bh;
			break;
		}
	}
	bh_lru_unlock();
	return ret;
}

/*
 * Perform a pagecache lookup for the matching buffer.  If it's there, refresh
 * it in the LRU and mark it as accessed.  If it is not present then return
 * NULL
 */
/**
 * 返回页高速缓存中的块缓冲区对应的缓冲区首部的地址。如果不存在，就返回NULL。
 * bdev:	设备描述符。
 * block:	要搜索的块号。
 * size:	块大小。
 */
struct buffer_head *
__find_get_block(struct block_device *bdev, sector_t block, int size)
{
	/**
	 * 检查CPU的LRU块调整缓存数组中是否有一个缓冲区首部。
	 * 如果缓冲区首部在LRU块调整缓存中，就刷新数组中的元素，以便让指针指在第一个位置，递增它的b_count字段。
	 */
	struct buffer_head *bh = lookup_bh_lru(bdev, block, size);

	if (bh == NULL) {
		/**
		 * 缓冲区首部不在LRU块调整缓存中，则调用__find_get_block_slow在高速缓存中搜索。
		 */
		bh = __find_get_block_slow(bdev, block, size);
		if (bh)
			bh_lru_install(bh);
	}
	/**
	 * 如果有必要，就调用mark_page_accessed把缓冲区移至适当的LRU链表中。
	 */
	if (bh)
		touch_buffer(bh);
	return bh;
}
EXPORT_SYMBOL(__find_get_block);

/*
 * __getblk will locate (and, if necessary, create) the buffer_head
 * which corresponds to the passed block_device, block and size. The
 * returned buffer has its reference count incremented.
 *
 * __getblk() cannot fail - it just keeps trying.  If you pass it an
 * illegal block number, __getblk() will happily return a buffer_head
 * which represents the non-existent block.  Very weird.
 *
 * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
 * attempt is failing.  FIXME, perhaps?
 */
/**
 * 确定块在页高速缓存中的位置。
 * 它自动完成引用计数的增加，因此高层函数不必再增加块缓冲区的引用计数器。
 *
 * 返回页高速缓存中的块缓冲区对应的缓冲区首部的地址。如果不存在，就分配块设备缓冲区页并返回缓冲区首部指针。
 * bdev:	设备描述符。
 * block:	要搜索的块号。
 * size:	块大小。
 */
 struct buffer_head *
__getblk(struct block_device *bdev, sector_t block, int size)
{
	/**
	 * __find_get_block检查块是否已经在页高速缓存中。如果找到，就返回缓冲区首部地址。
	 */
	struct buffer_head *bh = __find_get_block(bdev, block, size);

	might_sleep();
	/**
	 * 没有在调整缓存首部找到所请求的页。
	 * 则调用grow_buffers为所请求的页分配一个新的缓冲区页。
	 */
	if (bh == NULL)
		bh = __getblk_slow(bdev, block, size);
	return bh;
}
EXPORT_SYMBOL(__getblk);

/*
 * Do async read-ahead on a buffer..
 */
void __breadahead(struct block_device *bdev, sector_t block, int size)
{
	struct buffer_head *bh = __getblk(bdev, block, size);
	ll_rw_block(READA, 1, &bh);
	brelse(bh);
}
EXPORT_SYMBOL(__breadahead);

/**
 *  __bread() - reads a specified block and returns the bh
 *  @block: number of block
 *  @size: size (in bytes) to read
 * 
 *  Reads a specified block, and returns buffer head that contains it.
 *  It returns NULL if the block was unreadable.
 */
/**
 * 该函数与__getblk类似，但是与__getblk相反的是，如果需要的话，它就会在返回缓冲区首部之前从磁盘中读取块的内容。
 */
struct buffer_head *
__bread(struct block_device *bdev, sector_t block, int size)
{
	/**
	 * 调用__getblk在页高速缓存中查找与所请求的块相关的缓冲区页。并获得指向相应的缓冲区首部的指针。
	 */
	struct buffer_head *bh = __getblk(bdev, block, size);

	/**
	 * 块中还没有包含有效数据(BH_Uptodate标志没有被置位)
	 */
	if (!buffer_uptodate(bh))
		/**
		 * __bread_slow会调用通用块层的函数读取数据到内存中。
		 */
		bh = __bread_slow(bh);
	return bh;
}
EXPORT_SYMBOL(__bread);

/*
 * invalidate_bh_lrus() is called rarely - but not only at unmount.
 * This doesn't race because it runs in each cpu either in irq
 * or with preempt disabled.
 */
static void invalidate_bh_lru(void *arg)
{
	struct bh_lru *b = &get_cpu_var(bh_lrus);
	int i;

	for (i = 0; i < BH_LRU_SIZE; i++) {
		brelse(b->bhs[i]);
		b->bhs[i] = NULL;
	}
	put_cpu_var(bh_lrus);
}


static void invalidate_bh_lrus(void)
{
	on_each_cpu(invalidate_bh_lru, NULL, 1, 1);
}

void set_bh_page(struct buffer_head *bh,
		struct page *page, unsigned long offset)
{
	bh->b_page = page;
	if (offset >= PAGE_SIZE)
		BUG();
	if (PageHighMem(page))
		/*
		 * This catches illegal uses and preserves the offset:
		 */
		bh->b_data = (char *)(0 + offset);
	else
		bh->b_data = page_address(page) + offset;
}
EXPORT_SYMBOL(set_bh_page);

/*
 * Called when truncating a buffer on a page completely.
 */
static inline void discard_buffer(struct buffer_head * bh)
{
	lock_buffer(bh);
	clear_buffer_dirty(bh);
	bh->b_bdev = NULL;
	clear_buffer_mapped(bh);
	clear_buffer_req(bh);
	clear_buffer_new(bh);
	clear_buffer_delay(bh);
	unlock_buffer(bh);
}

/**
 * try_to_release_page() - release old fs-specific metadata on a page
 *
 * @page: the page which the kernel is trying to free
 * @gfp_mask: memory allocation flags (and I/O mode)
 *
 * The address_space is to try to release any data against the page
 * (presumably at page->private).  If the release was successful, return `1'.
 * Otherwise return zero.
 *
 * The @gfp_mask argument specifies whether I/O may be performed to release
 * this page (__GFP_IO), and whether the call may block (__GFP_WAIT).
 *
 * NOTE: @gfp_mask may go away, and this function may become non-blocking.
 */
/**
 * 释放缓冲区页。
 * page:	要释放的页描述符地址。
 */
int try_to_release_page(struct page *page, int gfp_mask)
{
	struct address_space * const mapping = page->mapping;

	BUG_ON(!PageLocked(page));
	/**
	 * 正在试图将页写回磁盘，因此不能将页释放。
	 */ 
	if (PageWriteback(page))
		return 0;

	/**
	 * 如果定义了块设备的address_space对象的releasepage方法，就调用它。
	 * 该回调函数通常是没有定义的。
	 */
	if (mapping && mapping->a_ops->releasepage)
		return mapping->a_ops->releasepage(page, gfp_mask);
	/**
	 * try_to_free_buffers函数依次检查页中的缓冲区首部标志。
	 */
	return try_to_free_buffers(page);
}
EXPORT_SYMBOL(try_to_release_page);

/**
 * block_invalidatepage - invalidate part of all of a buffer-backed page
 *
 * @page: the page which is affected
 * @offset: the index of the truncation point
 *
 * block_invalidatepage() is called when all or part of the page has become
 * invalidatedby a truncate operation.
 *
 * block_invalidatepage() does not have to release all buffers, but it must
 * ensure that no dirty buffer is left outside @offset and that no I/O
 * is underway against any of the blocks which are outside the truncation
 * point.  Because the caller is about to free (and possibly reuse) those
 * blocks on-disk.
 */
int block_invalidatepage(struct page *page, unsigned long offset)
{
	struct buffer_head *head, *bh, *next;
	unsigned int curr_off = 0;
	int ret = 1;

	BUG_ON(!PageLocked(page));
	if (!page_has_buffers(page))
		goto out;

	head = page_buffers(page);
	bh = head;
	do {
		unsigned int next_off = curr_off + bh->b_size;
		next = bh->b_this_page;

		/*
		 * is this block fully invalidated?
		 */
		if (offset <= curr_off)
			discard_buffer(bh);
		curr_off = next_off;
		bh = next;
	} while (bh != head);

	/*
	 * We release buffers only if the entire page is being invalidated.
	 * The get_block cached value has been unconditionally invalidated,
	 * so real IO is not possible anymore.
	 */
	if (offset == 0)
		ret = try_to_release_page(page, 0);
out:
	return ret;
}
EXPORT_SYMBOL(block_invalidatepage);

/*
 * We attach and possibly dirty the buffers atomically wrt
 * __set_page_dirty_buffers() via private_lock.  try_to_free_buffers
 * is already excluded via the page lock.
 */
/**
 * 为页所含块缓冲区分配缓冲区首部。
 */
void create_empty_buffers(struct page *page,
			unsigned long blocksize, unsigned long b_state)
{
	struct buffer_head *bh, *head, *tail;

	head = alloc_page_buffers(page, blocksize, 1);
	bh = head;
	do {
		bh->b_state |= b_state;
		tail = bh;
		bh = bh->b_this_page;
	} while (bh);
	tail->b_this_page = head;

	spin_lock(&page->mapping->private_lock);
	if (PageUptodate(page) || PageDirty(page)) {
		bh = head;
		do {
			if (PageDirty(page))
				set_buffer_dirty(bh);
			if (PageUptodate(page))
				set_buffer_uptodate(bh);
			bh = bh->b_this_page;
		} while (bh != head);
	}
	attach_page_buffers(page, head);
	spin_unlock(&page->mapping->private_lock);
}
EXPORT_SYMBOL(create_empty_buffers);

/*
 * We are taking a block for data and we don't want any output from any
 * buffer-cache aliases starting from return from that function and
 * until the moment when something will explicitly mark the buffer
 * dirty (hopefully that will not happen until we will free that block ;-)
 * We don't even need to mark it not-uptodate - nobody can expect
 * anything from a newly allocated buffer anyway. We used to used
 * unmap_buffer() for such invalidation, but that was wrong. We definitely
 * don't want to mark the alias unmapped, for example - it would confuse
 * anyone who might pick it with bread() afterwards...
 *
 * Also..  Note that bforget() doesn't lock the buffer.  So there can
 * be writeout I/O going on against recently-freed buffers.  We don't
 * wait on that I/O in bforget() - it's more efficient to wait on the I/O
 * only if we really need to.  That happens here.
 */
void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
{
	struct buffer_head *old_bh;

	might_sleep();

	old_bh = __find_get_block_slow(bdev, block, 0);
	if (old_bh) {
		clear_buffer_dirty(old_bh);
		wait_on_buffer(old_bh);
		clear_buffer_req(old_bh);
		__brelse(old_bh);
	}
}
EXPORT_SYMBOL(unmap_underlying_metadata);

/*
 * NOTE! All mapped/uptodate combinations are valid:
 *
 *	Mapped	Uptodate	Meaning
 *
 *	No	No		"unknown" - must do get_block()
 *	No	Yes		"hole" - zero-filled
 *	Yes	No		"allocated" - allocated on disk, not read in
 *	Yes	Yes		"valid" - allocated and up-to-date in memory.
 *
 * "Dirty" is valid only with the last case (mapped+uptodate).
 */

/*
 * While block_write_full_page is writing back the dirty buffers under
 * the page lock, whoever dirtied the buffers may decide to clean them
 * again at any time.  We handle that by only looking at the buffer
 * state inside lock_buffer().
 *
 * If block_write_full_page() is called for regular writeback
 * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
 * locked buffer.   This only can happen if someone has written the buffer
 * directly, with submit_bh().  At the address_space level PageWriteback
 * prevents this contention from occurring.
 */
static int __block_write_full_page(struct inode *inode, struct page *page,
			get_block_t *get_block, struct writeback_control *wbc)
{
	int err;
	sector_t block;
	sector_t last_block;
	struct buffer_head *bh, *head;
	int nr_underway = 0;

	BUG_ON(!PageLocked(page));

	/* 计算文件最后一个块的编号 */
	last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
	/**
	 * 象block_read_full_page一样，如果还没有在缓冲区页中，就分配缓冲区首部
	 */
	if (!page_has_buffers(page)) {
		create_empty_buffers(page, 1 << inode->i_blkbits,
					(1 << BH_Dirty)|(1 << BH_Uptodate));
	}

	/*
	 * Be very careful.  We have no exclusion from __set_page_dirty_buffers
	 * here, and the (potentially unmapped) buffers may become dirty at
	 * any time.  If a buffer becomes dirty here after we've inspected it
	 * then we just miss that fact, and the page stays dirty.
	 *
	 * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
	 * handle that here by just cleaning them.
	 */

	block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
	head = page_buffers(page);
	bh = head;

	/*
	 * Get all the dirty buffers mapped to disk addresses and
	 * handle any aliases from the underlying blockdev's mapping.
	 */
	/* 确保已经映射了所有缓冲区 */
	do {
		if (block > last_block) {/* 超过了文件块 */
			/*
			 * mapped buffers outside i_size will occur, because
			 * this page can be outside i_size when there is a
			 * truncate in progress.
			 */
			/*
			 * The buffer was zeroed by block_write_full_page()
			 */
			/* 上层调用者必须清除了页面内容，这里将缓冲区脏标记清除，并设置uptodate标志 */
			clear_buffer_dirty(bh);
			set_buffer_uptodate(bh);
		} else if (!buffer_mapped(bh) && buffer_dirty(bh)) {/* 如果还没有映射到磁盘，并且缓冲区为脏 */
			/* 获得磁盘逻辑块，必要时创建间接块 */
			err = get_block(inode, block, bh, 1);
			if (err)
				goto recover;
			if (buffer_new(bh)) {
				/* blockdev mappings never come here */
				clear_buffer_new(bh);
				unmap_underlying_metadata(bh->b_bdev,
							bh->b_blocknr);
			}
		}
		/* 处理下一个缓冲区首部 */
		bh = bh->b_this_page;
		block++;
	} while (bh != head);

	/* 锁住缓冲区 */
	do {
		get_bh(bh);
		if (!buffer_mapped(bh))/* 超出文件大小的缓冲区，不需要处理 */
			continue;
		/*
		 * If it's a fully non-blocking write attempt and we cannot
		 * lock the buffer then redirty the page.  Note that this can
		 * potentially cause a busy-wait loop from pdflush and kswapd
		 * activity, but those code paths have their own higher-level
		 * throttling.
		 */
		if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {/* 要求完整的回写文件，或者允许阻塞 */
			/* 锁定缓冲区 */
			lock_buffer(bh);
		} else if (test_set_buffer_locked(bh)) {/* 尝试获得锁，如果不能获得则继续处理下一个缓冲区 */
			redirty_page_for_writepage(wbc, page);/* 重新标记页面为脏 */
			continue;
		}
		/* 运行到此，已经成功获得缓冲区首部的锁 */
		if (test_clear_buffer_dirty(bh)) {
			/* 如果页面为脏，这里设置其回调函数，并设置BH_Async_Write标志，稍后将其写入磁盘 */
			mark_buffer_async_write(bh);
		} else {
			unlock_buffer(bh);/* 如果不是脏缓冲区，则解锁，不必写到磁盘上 */
		}
	} while ((bh = bh->b_this_page) != head);

	/*
	 * The page and its buffers are protected by PageWriteback(), so we can
	 * drop the bh refcounts early.
	 */
	/* 确保没有设置回写标志,并将回写标志置上，回写标志将保护页面和它的块缓冲区 */
	BUG_ON(PageWriteback(page));
	set_page_writeback(page);
	unlock_page(page);

	/**
	 * 对每缓冲区调用submit_bh函数来执行写操作。
	 */
	do {
		struct buffer_head *next = bh->b_this_page;
		if (buffer_async_write(bh)) {/* 缓冲区脏，需要提交 */
			submit_bh(WRITE, bh);
			nr_underway++;
		}
		put_bh(bh);
		bh = next;
	} while (bh != head);

	err = 0;
done:
	if (nr_underway == 0) {
		/*
		 * The page was marked dirty, but the buffers were
		 * clean.  Someone wrote them back by hand with
		 * ll_rw_block/submit_bh.  A rare case.
		 */
		int uptodate = 1;
		do {
			if (!buffer_uptodate(bh)) {
				uptodate = 0;
				break;
			}
			bh = bh->b_this_page;
		} while (bh != head);
		if (uptodate)
			SetPageUptodate(page);
		end_page_writeback(page);
		/*
		 * The page and buffer_heads can be released at any time from
		 * here on.
		 */
		wbc->pages_skipped++;	/* We didn't write this page */
	}
	return err;

recover:
	/*
	 * ENOSPC, or some other error.  We may already have added some
	 * blocks to the file, so we need to write these out to avoid
	 * exposing stale data.
	 * The page is currently locked and not marked for writeback
	 */
	bh = head;
	/* Recovery: lock and submit the mapped buffers */
	do {
		get_bh(bh);
		if (buffer_mapped(bh) && buffer_dirty(bh)) {
			lock_buffer(bh);
			mark_buffer_async_write(bh);
		} else {
			/*
			 * The buffer may have been set dirty during
			 * attachment to a dirty page.
			 */
			clear_buffer_dirty(bh);
		}
	} while ((bh = bh->b_this_page) != head);
	SetPageError(page);
	BUG_ON(PageWriteback(page));
	set_page_writeback(page);
	unlock_page(page);
	do {
		struct buffer_head *next = bh->b_this_page;
		if (buffer_async_write(bh)) {
			clear_buffer_dirty(bh);
			submit_bh(WRITE, bh);
			nr_underway++;
		}
		put_bh(bh);
		bh = next;
	} while (bh != head);
	goto done;
}

/**
 * 为文件页的缓冲区和缓冲区首部做准备
 */
static int __block_prepare_write(struct inode *inode, struct page *page,
		unsigned from, unsigned to, get_block_t *get_block)
{
	unsigned block_start, block_end;
	sector_t block;
	int err = 0;
	unsigned blocksize, bbits;
	struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;

	BUG_ON(!PageLocked(page));
	BUG_ON(from > PAGE_CACHE_SIZE);
	BUG_ON(to > PAGE_CACHE_SIZE);
	BUG_ON(from > to);

	blocksize = 1 << inode->i_blkbits;
	/**
	 * 检查某页是否是一个缓冲区页（如果是则PG_Private标志置位）。
	 * 如果没有设置该标志，则调用create_empty_buffers为页中所有的缓冲区分配缓冲区首部
	 */
	if (!page_has_buffers(page))
		create_empty_buffers(page, blocksize, 0);
	head = page_buffers(page);

	bbits = inode->i_blkbits;
	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);

	/**
	 * 对页中包含的缓冲区对应的每个缓冲区首部，及受写操作影响的每个缓冲区首部。
	 */
	for(bh = head, block_start = 0; bh != head || !block_start;
	    block++, block_start=block_end, bh = bh->b_this_page) {
		block_end = block_start + blocksize;
		if (block_end <= from || block_start >= to) {
			if (PageUptodate(page)) {
				if (!buffer_uptodate(bh))
					set_buffer_uptodate(bh);
			}
			continue;
		}
		/**
		 * 如果BH_New标志置位，就将它清0。
		 */
		if (buffer_new(bh))
			clear_buffer_new(bh);
		/**
		 * 如果BH_New标志已经清0，则
		 */
		if (!buffer_mapped(bh)) {
			/**
			 * get_block是传递过来的依赖于文件系统的函数。查看这个文件系统磁盘数据结构并查找缓冲区的逻辑块号。
			 * （相对于分区的起始位置而不是普通文件的起始位置）
			 * 与文件系统相关的函数把这个数存放在对应缓冲区首部的b_blicknr字段。并设置它的BH_Mapped标志。
			 * 与文件系统相关的函数可能为文件分配一个新的物理块（如：访问的块掉进文件的“洞”中）。这种情况下，设置BH_New值
			 */
			err = get_block(inode, block, bh, 1);
			if (err)
				goto out;
			/**
			 * 检查BH_New标志的值
			 */
			if (buffer_new(bh)) {
				clear_buffer_new(bh);
				/**
				 * 设置了BH_New标志的值，调用unmap_underlying_metadata检查页高速缓存内的某个块设备缓冲区页中是否包含指向磁盘同一块的一个缓冲区
				 * 尽管可能性不大，但在一个用户直接向块设备文件写数据块时，还是会出现这种情况，从而越过文件系统。
				 * 该函数实际上调用__find_get_block在页高速缓存内查找一个旧块。如果找到一块，函数将BH_Dirty标志清0并等待直到该缓冲区的IO传输完毕。
				 */
				unmap_underlying_metadata(bh->b_bdev,
							bh->b_blocknr);
				if (PageUptodate(page)) {
					set_buffer_uptodate(bh);
					continue;
				}
				/**
				 * 另外，如果写操作不对整个缓冲区重写，则用0填写未写区域。
				 */
				if (block_end > to || block_start < from) {
					void *kaddr;

					kaddr = kmap_atomic(page, KM_USER0);
					if (block_end > to)
						memset(kaddr+to, 0,
							block_end-to);
					if (block_start < from)
						memset(kaddr+block_start,
							0, from-block_start);
					flush_dcache_page(page);
					kunmap_atomic(kaddr, KM_USER0);
				}
				continue;
			}
		}
		if (PageUptodate(page)) {
			if (!buffer_uptodate(bh))
				set_buffer_uptodate(bh);
			continue; 
		}
		/**
		 * 不重写整个缓冲区，且它的BH_Delay和BH_Uptodate标志未置位
		 * （即已经在磁盘文件系统数据结构中分配了块，但是RAM的缓冲区中并没有有效的数据映像）
		 * 函数对该块调用ll_rw_block从磁盘读取它的内容
		 */
		if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
		     (block_start < from || block_end > to)) {
			ll_rw_block(READ, 1, &bh);
			*wait_bh++=bh;
		}
	}
	/*
	 * If we issued read requests - let them complete.
	 */
	/**
	 * 阻塞当前进程，直到ll_rw_block要求的所有读操作都全部完成。
	 */
	while(wait_bh > wait) {
		wait_on_buffer(*--wait_bh);
		if (!buffer_uptodate(*wait_bh))
			return -EIO;
	}
	return 0;
out:
	/*
	 * Zero out any newly allocated blocks to avoid exposing stale
	 * data.  If BH_New is set, we know that the block was newly
	 * allocated in the above loop.
	 */
	bh = head;
	block_start = 0;
	do {
		block_end = block_start+blocksize;
		if (block_end <= from)
			goto next_bh;
		if (block_start >= to)
			break;
		if (buffer_new(bh)) {
			void *kaddr;

			clear_buffer_new(bh);
			kaddr = kmap_atomic(page, KM_USER0);
			memset(kaddr+block_start, 0, bh->b_size);
			kunmap_atomic(kaddr, KM_USER0);
			set_buffer_uptodate(bh);
			mark_buffer_dirty(bh);
		}
next_bh:
		block_start = block_end;
		bh = bh->b_this_page;
	} while (bh != head);
	return err;
}

static int __block_commit_write(struct inode *inode, struct page *page,
		unsigned from, unsigned to)
{
	unsigned block_start, block_end;
	int partial = 0;
	unsigned blocksize;
	struct buffer_head *bh, *head;

	blocksize = 1 << inode->i_blkbits;

	/**
	 * 考虑页中受写操作影响的所有缓冲区。对于其中的每个缓冲区，将对应缓冲区首部的BH_Uptodate和BH_Dirty标志置位。
	 */
	for(bh = head = page_buffers(page), block_start = 0;
	    bh != head || !block_start;
	    block_start=block_end, bh = bh->b_this_page) {
		block_end = block_start + blocksize;
		if (block_end <= from || block_start >= to) {
			if (!buffer_uptodate(bh))
				partial = 1;
		} else {
			set_buffer_uptodate(bh);
			mark_buffer_dirty(bh);
		}
	}

	/*
	 * If this is a partial write which happened to make all buffers
	 * uptodate then we can optimize away a bogus readpage() for
	 * the next read(). Here we 'discover' whether the page went
	 * uptodate as a result of this (potentially partial) write.
	 */
	/**
	 * 如果缓冲区页中的所有缓冲区是最新的，则将PG_uptodate标志置位。
	 */
	if (!partial)
		SetPageUptodate(page);
	return 0;
}

/*
 * Generic "read page" function for block devices that have the normal
 * get_block functionality. This is most of the block device filesystems.
 * Reads the page asynchronously --- the unlock_buffer() and
 * set/clear_buffer_uptodate() functions propagate buffer state into the
 * page struct once IO has completed.
 */
/**
 * 块设备文件的readpage方法总是相同的，它是由blkdev_readpage，而blkdev_readpage会调用本方法。
 * get_block把相对于文件开始处的文件块号转换为相对于块设备开始处的逻辑块号。
 * 不过对块设备文件来说，这两个数是一致的。
 * block_read_full_page以一次读一块的方式读一页数据。
 * 当读块设备文件和磁盘上块不相邻的普通文件时都使用该函数。
 */
int block_read_full_page(struct page *page, get_block_t *get_block)
{
	struct inode *inode = page->mapping->host;
	sector_t iblock, lblock;
	struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
	unsigned int blocksize;
	int nr, i;
	int fully_mapped = 1;

	if (!PageLocked(page))
		PAGE_BUG(page);
	blocksize = 1 << inode->i_blkbits;
	/**
	 * 检查页描述符的标志PG_private，如果置位，则该页与描述组成页的块的缓冲区首部链表相关（把块存放在页高速缓存中）
	 * 否则，调用create_empty_buffers为该页所含的所有块缓冲区分配缓冲区首部。
	 */
	if (!page_has_buffers(page))
		create_empty_buffers(page, blocksize, 0);
	head = page_buffers(page);

	/**
	 * 从相对于页的文件偏移量(page->index)计算出页中第一块的文件块号
	 */
	iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
	lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
	bh = head;
	nr = 0;
	i = 0;

	/**
	 * 对该页中每个缓冲区首部，执行以下操作
	 */
	do {
		/**
		 * 如果BH_Uptodate置位，则跳过该缓冲区继续处理该页的下一个缓冲区
		 */
		if (buffer_uptodate(bh))
			continue;

		/**
		 * 如果BH_Mapped未置位
		 */
		if (!buffer_mapped(bh)) {
			fully_mapped = 0;
			/**
			 * 并且该块没有超出文件尾
			 */
			if (iblock < lblock) {
				/**
				 * 调用依赖于文件系统的get_block函数。该函数做为参数传入。
				 * 对于普通文件，该函数在文件系统的磁盘数据结构中查找，得到相对于磁盘或分区开始处的缓冲区逻辑块号
				 * 对于块设备文件，不同的是该函数把文件块号当作逻辑块号。
				 * 对这两种情形，函数都将逻辑块号存放在相应缓冲区首部的b_blocknr字段中，并将标志BH_Mapped标志置位
				 */
				if (get_block(inode, iblock, bh, 0))
					SetPageError(page);
			}
			if (!buffer_mapped(bh)) {
				void *kaddr = kmap_atomic(page, KM_USER0);
				memset(kaddr + i * blocksize, 0, blocksize);
				flush_dcache_page(page);
				kunmap_atomic(kaddr, KM_USER0);
				set_buffer_uptodate(bh);
				continue;
			}
			/*
			 * get_block() might have updated the buffer
			 * synchronously
			 */
			/**
			 * 再次检查BH_Uptodate标志。因为依赖于文件系统的get_block函数可能已经角发了块IO操作而更新了缓冲区。
			 * 如果BH_Uptodate置位了，就继续处理下一缓冲区。
			 */
			if (buffer_uptodate(bh))
				continue;
		}
		/**
		 * 将缓冲区首部的地址存放在局部数组att中，继续该页的下一个缓冲区
		 */
		arr[nr++] = bh;
	} while (i++, iblock++, (bh = bh->b_this_page) != head);

	/**
	 * 如果没有遇到文件洞，则设置PG_mappedtodisk标志
	 */
	if (fully_mapped)
		SetPageMappedToDisk(page);

	/**
	 * arr中存放了一些缓冲区首部的地址，与其对应的缓冲区的内容不是最新的。
	 * 如果数组为空，那么页中所有缓冲区都是有效的。因此，设置页的PG_uptodate标志
	 */
	if (!nr) {
		/*
		 * All buffers are uptodate - we can set the page uptodate
		 * as well. But not if get_block() returned an error.
		 */
		if (!PageError(page))
			SetPageUptodate(page);
		/**
		 * 设置PG_uptodate标志后，调用unlock_page并返回。
		 */
		unlock_page(page);
		return 0;
	}

	/* Stage two: lock the buffers */
	/**
	 * arr非空，对数组中的每个缓冲区首部执行以下操作
	 */
	for (i = 0; i < nr; i++) {
		bh = arr[i];
		/**
		 * 将BH_Lock置位。一旦置位，函数将一直等待该缓冲区释放。
		 */
		lock_buffer(bh);
		/**
		 * 将缓冲区首部的b_end_io字段设为end_buffer_async_read函数的地址。
		 * 并将缓冲区首部的BH_Async_Read标志置位
		 */
		mark_buffer_async_read(bh);
	}

	/*
	 * Stage 3: start the IO.  Check for uptodateness
	 * inside the buffer lock in case another process reading
	 * the underlying blockdev brought it uptodate (the sct fix).
	 */
	/**
	 * 对arr中的每个缓冲区首部，调用submit_bh，将操作类型设为READ后，它会触发相应块的IO数据传输。
	 */
	for (i = 0; i < nr; i++) {
		bh = arr[i];
		if (buffer_uptodate(bh))
			end_buffer_async_read(bh, 1);
		else
			submit_bh(READ, bh);
	}
	return 0;
}

/* utility function for filesystems that need to do work on expanding
 * truncates.  Uses prepare/commit_write to allow the filesystem to
 * deal with the hole.  
 */
int generic_cont_expand(struct inode *inode, loff_t size)
{
	struct address_space *mapping = inode->i_mapping;
	struct page *page;
	unsigned long index, offset, limit;
	int err;

	err = -EFBIG;
        limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
	if (limit != RLIM_INFINITY && size > (loff_t)limit) {
		send_sig(SIGXFSZ, current, 0);
		goto out;
	}
	if (size > inode->i_sb->s_maxbytes)
		goto out;

	offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */

	/* ugh.  in prepare/commit_write, if from==to==start of block, we 
	** skip the prepare.  make sure we never send an offset for the start
	** of a block
	*/
	if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
		offset++;
	}
	index = size >> PAGE_CACHE_SHIFT;
	err = -ENOMEM;
	page = grab_cache_page(mapping, index);
	if (!page)
		goto out;
	err = mapping->a_ops->prepare_write(NULL, page, offset, offset);
	if (!err) {
		err = mapping->a_ops->commit_write(NULL, page, offset, offset);
	}
	unlock_page(page);
	page_cache_release(page);
	if (err > 0)
		err = 0;
out:
	return err;
}

/*
 * For moronic filesystems that do not allow holes in file.
 * We may have to extend the file.
 */

int cont_prepare_write(struct page *page, unsigned offset,
		unsigned to, get_block_t *get_block, loff_t *bytes)
{
	struct address_space *mapping = page->mapping;
	struct inode *inode = mapping->host;
	struct page *new_page;
	pgoff_t pgpos;
	long status;
	unsigned zerofrom;
	unsigned blocksize = 1 << inode->i_blkbits;
	void *kaddr;

	while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
		status = -ENOMEM;
		new_page = grab_cache_page(mapping, pgpos);
		if (!new_page)
			goto out;
		/* we might sleep */
		if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
			unlock_page(new_page);
			page_cache_release(new_page);
			continue;
		}
		zerofrom = *bytes & ~PAGE_CACHE_MASK;
		if (zerofrom & (blocksize-1)) {
			*bytes |= (blocksize-1);
			(*bytes)++;
		}
		status = __block_prepare_write(inode, new_page, zerofrom,
						PAGE_CACHE_SIZE, get_block);
		if (status)
			goto out_unmap;
		kaddr = kmap_atomic(new_page, KM_USER0);
		memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
		flush_dcache_page(new_page);
		kunmap_atomic(kaddr, KM_USER0);
		generic_commit_write(NULL, new_page, zerofrom, PAGE_CACHE_SIZE);
		unlock_page(new_page);
		page_cache_release(new_page);
	}

	if (page->index < pgpos) {
		/* completely inside the area */
		zerofrom = offset;
	} else {
		/* page covers the boundary, find the boundary offset */
		zerofrom = *bytes & ~PAGE_CACHE_MASK;

		/* if we will expand the thing last block will be filled */
		if (to > zerofrom && (zerofrom & (blocksize-1))) {
			*bytes |= (blocksize-1);
			(*bytes)++;
		}

		/* starting below the boundary? Nothing to zero out */
		if (offset <= zerofrom)
			zerofrom = offset;
	}
	status = __block_prepare_write(inode, page, zerofrom, to, get_block);
	if (status)
		goto out1;
	if (zerofrom < offset) {
		kaddr = kmap_atomic(page, KM_USER0);
		memset(kaddr+zerofrom, 0, offset-zerofrom);
		flush_dcache_page(page);
		kunmap_atomic(kaddr, KM_USER0);
		__block_commit_write(inode, page, zerofrom, offset);
	}
	return 0;
out1:
	ClearPageUptodate(page);
	return status;

out_unmap:
	ClearPageUptodate(new_page);
	unlock_page(new_page);
	page_cache_release(new_page);
out:
	return status;
}

/**
 * 为文件页的缓冲区和缓冲区首部做准备
 */
int block_prepare_write(struct page *page, unsigned from, unsigned to,
			get_block_t *get_block)
{
	struct inode *inode = page->mapping->host;
	int err = __block_prepare_write(inode, page, from, to, get_block);
	if (err)
		ClearPageUptodate(page);
	return err;
}

int block_commit_write(struct page *page, unsigned from, unsigned to)
{
	struct inode *inode = page->mapping->host;
	__block_commit_write(inode,page,from,to);
	return 0;
}

/**
 * address_space对象的commit_write方法。这个方法几乎适用于所有非日志型磁盘文件系统。
 */
int generic_commit_write(struct file *file, struct page *page,
		unsigned from, unsigned to)
{
	struct inode *inode = page->mapping->host;
	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
	
	__block_commit_write(inode,page,from,to);
	/*
	 * No need to use i_size_read() here, the i_size
	 * cannot change under us because we hold i_sem.
	 */
	/**
	 * 检查写操作是否将文件增大，如果增大则更新文件索引结点对象的i_size字段。
	 */
	if (pos > inode->i_size) {
		i_size_write(inode, pos);
		mark_inode_dirty(inode);
	}
	return 0;
}


/*
 * nobh_prepare_write()'s prereads are special: the buffer_heads are freed
 * immediately, while under the page lock.  So it needs a special end_io
 * handler which does not touch the bh after unlocking it.
 *
 * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
 * a race there is benign: unlock_buffer() only use the bh's address for
 * hashing after unlocking the buffer, so it doesn't actually touch the bh
 * itself.
 */
static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
{
	if (uptodate) {
		set_buffer_uptodate(bh);
	} else {
		/* This happens, due to failed READA attempts. */
		clear_buffer_uptodate(bh);
	}
	unlock_buffer(bh);
}

/*
 * On entry, the page is fully not uptodate.
 * On exit the page is fully uptodate in the areas outside (from,to)
 */
int nobh_prepare_write(struct page *page, unsigned from, unsigned to,
			get_block_t *get_block)
{
	struct inode *inode = page->mapping->host;
	const unsigned blkbits = inode->i_blkbits;
	const unsigned blocksize = 1 << blkbits;
	struct buffer_head map_bh;
	struct buffer_head *read_bh[MAX_BUF_PER_PAGE];
	unsigned block_in_page;
	unsigned block_start;
	sector_t block_in_file;
	char *kaddr;
	int nr_reads = 0;
	int i;
	int ret = 0;
	int is_mapped_to_disk = 1;
	int dirtied_it = 0;

	if (PageMappedToDisk(page))
		return 0;

	block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
	map_bh.b_page = page;

	/*
	 * We loop across all blocks in the page, whether or not they are
	 * part of the affected region.  This is so we can discover if the
	 * page is fully mapped-to-disk.
	 */
	for (block_start = 0, block_in_page = 0;
		  block_start < PAGE_CACHE_SIZE;
		  block_in_page++, block_start += blocksize) {
		unsigned block_end = block_start + blocksize;
		int create;

		map_bh.b_state = 0;
		create = 1;
		if (block_start >= to)
			create = 0;
		ret = get_block(inode, block_in_file + block_in_page,
					&map_bh, create);
		if (ret)
			goto failed;
		if (!buffer_mapped(&map_bh))
			is_mapped_to_disk = 0;
		if (buffer_new(&map_bh))
			unmap_underlying_metadata(map_bh.b_bdev,
							map_bh.b_blocknr);
		if (PageUptodate(page))
			continue;
		if (buffer_new(&map_bh) || !buffer_mapped(&map_bh)) {
			kaddr = kmap_atomic(page, KM_USER0);
			if (block_start < from) {
				memset(kaddr+block_start, 0, from-block_start);
				dirtied_it = 1;
			}
			if (block_end > to) {
				memset(kaddr + to, 0, block_end - to);
				dirtied_it = 1;
			}
			flush_dcache_page(page);
			kunmap_atomic(kaddr, KM_USER0);
			continue;
		}
		if (buffer_uptodate(&map_bh))
			continue;	/* reiserfs does this */
		if (block_start < from || block_end > to) {
			struct buffer_head *bh = alloc_buffer_head(GFP_NOFS);

			if (!bh) {
				ret = -ENOMEM;
				goto failed;
			}
			bh->b_state = map_bh.b_state;
			atomic_set(&bh->b_count, 0);
			bh->b_this_page = NULL;
			bh->b_page = page;
			bh->b_blocknr = map_bh.b_blocknr;
			bh->b_size = blocksize;
			bh->b_data = (char *)(long)block_start;
			bh->b_bdev = map_bh.b_bdev;
			bh->b_private = NULL;
			read_bh[nr_reads++] = bh;
		}
	}

	if (nr_reads) {
		struct buffer_head *bh;

		/*
		 * The page is locked, so these buffers are protected from
		 * any VM or truncate activity.  Hence we don't need to care
		 * for the buffer_head refcounts.
		 */
		for (i = 0; i < nr_reads; i++) {
			bh = read_bh[i];
			lock_buffer(bh);
			bh->b_end_io = end_buffer_read_nobh;
			submit_bh(READ, bh);
		}
		for (i = 0; i < nr_reads; i++) {
			bh = read_bh[i];
			wait_on_buffer(bh);
			if (!buffer_uptodate(bh))
				ret = -EIO;
			free_buffer_head(bh);
			read_bh[i] = NULL;
		}
		if (ret)
			goto failed;
	}

	if (is_mapped_to_disk)
		SetPageMappedToDisk(page);
	SetPageUptodate(page);

	/*
	 * Setting the page dirty here isn't necessary for the prepare_write
	 * function - commit_write will do that.  But if/when this function is
	 * used within the pagefault handler to ensure that all mmapped pages
	 * have backing space in the filesystem, we will need to dirty the page
	 * if its contents were altered.
	 */
	if (dirtied_it)
		set_page_dirty(page);

	return 0;

failed:
	for (i = 0; i < nr_reads; i++) {
		if (read_bh[i])
			free_buffer_head(read_bh[i]);
	}

	/*
	 * Error recovery is pretty slack.  Clear the page and mark it dirty
	 * so we'll later zero out any blocks which _were_ allocated.
	 */
	kaddr = kmap_atomic(page, KM_USER0);
	memset(kaddr, 0, PAGE_CACHE_SIZE);
	kunmap_atomic(kaddr, KM_USER0);
	SetPageUptodate(page);
	set_page_dirty(page);
	return ret;
}
EXPORT_SYMBOL(nobh_prepare_write);

int nobh_commit_write(struct file *file, struct page *page,
		unsigned from, unsigned to)
{
	struct inode *inode = page->mapping->host;
	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;

	set_page_dirty(page);
	if (pos > inode->i_size) {
		i_size_write(inode, pos);
		mark_inode_dirty(inode);
	}
	return 0;
}
EXPORT_SYMBOL(nobh_commit_write);

/*
 * This function assumes that ->prepare_write() uses nobh_prepare_write().
 */
int nobh_truncate_page(struct address_space *mapping, loff_t from)
{
	struct inode *inode = mapping->host;
	unsigned blocksize = 1 << inode->i_blkbits;
	pgoff_t index = from >> PAGE_CACHE_SHIFT;
	unsigned offset = from & (PAGE_CACHE_SIZE-1);
	unsigned to;
	struct page *page;
	struct address_space_operations *a_ops = mapping->a_ops;
	char *kaddr;
	int ret = 0;

	if ((offset & (blocksize - 1)) == 0)
		goto out;

	ret = -ENOMEM;
	page = grab_cache_page(mapping, index);
	if (!page)
		goto out;

	to = (offset + blocksize) & ~(blocksize - 1);
	ret = a_ops->prepare_write(NULL, page, offset, to);
	if (ret == 0) {
		kaddr = kmap_atomic(page, KM_USER0);
		memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
		flush_dcache_page(page);
		kunmap_atomic(kaddr, KM_USER0);
		set_page_dirty(page);
	}
	unlock_page(page);
	page_cache_release(page);
out:
	return ret;
}
EXPORT_SYMBOL(nobh_truncate_page);

int block_truncate_page(struct address_space *mapping,
			loff_t from, get_block_t *get_block)
{
	pgoff_t index = from >> PAGE_CACHE_SHIFT;
	unsigned offset = from & (PAGE_CACHE_SIZE-1);
	unsigned blocksize;
	pgoff_t iblock;
	unsigned length, pos;
	struct inode *inode = mapping->host;
	struct page *page;
	struct buffer_head *bh;
	void *kaddr;
	int err;

	blocksize = 1 << inode->i_blkbits;
	length = offset & (blocksize - 1);

	/* Block boundary? Nothing to do */
	if (!length)
		return 0;

	length = blocksize - length;
	iblock = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
	
	page = grab_cache_page(mapping, index);
	err = -ENOMEM;
	if (!page)
		goto out;

	if (!page_has_buffers(page))
		create_empty_buffers(page, blocksize, 0);

	/* Find the buffer that contains "offset" */
	bh = page_buffers(page);
	pos = blocksize;
	while (offset >= pos) {
		bh = bh->b_this_page;
		iblock++;
		pos += blocksize;
	}

	err = 0;
	if (!buffer_mapped(bh)) {
		err = get_block(inode, iblock, bh, 0);
		if (err)
			goto unlock;
		/* unmapped? It's a hole - nothing to do */
		if (!buffer_mapped(bh))
			goto unlock;
	}

	/* Ok, it's mapped. Make sure it's up-to-date */
	if (PageUptodate(page))
		set_buffer_uptodate(bh);

	if (!buffer_uptodate(bh) && !buffer_delay(bh)) {
		err = -EIO;
		ll_rw_block(READ, 1, &bh);
		wait_on_buffer(bh);
		/* Uhhuh. Read error. Complain and punt. */
		if (!buffer_uptodate(bh))
			goto unlock;
	}

	kaddr = kmap_atomic(page, KM_USER0);
	memset(kaddr + offset, 0, length);
	flush_dcache_page(page);
	kunmap_atomic(kaddr, KM_USER0);

	mark_buffer_dirty(bh);
	err = 0;

unlock:
	unlock_page(page);
	page_cache_release(page);
out:
	return err;
}

/*
 * The generic ->writepage function for buffer-backed address_spaces
 */
/**
 * Ext2文件系统所实现的writepage方法是一个通用的block_write_full_page的封装函数，并会传递一个get_block参数
 * 对块设备来说，它不会直接调用block_write_full_page，而是调用block_write_full_page的封装函数blkdev_writepage函数实现writepage
 */
int block_write_full_page(struct page *page, get_block_t *get_block,
			struct writeback_control *wbc)
{
	struct inode * const inode = page->mapping->host;
	loff_t i_size = i_size_read(inode);
	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
	unsigned offset;
	void *kaddr;

	/* Is the page fully inside i_size? */
	/* 页面内容完全在文件范围内 */
	if (page->index < end_index)
		/* 写整个页面 */
		return __block_write_full_page(inode, page, get_block, wbc);

	/* Is the page fully outside i_size? (truncate in progress) */
	offset = i_size & (PAGE_CACHE_SIZE-1);
	if (page->index >= end_index+1 || !offset) {/* 页面完全不在文件内 */
		/*
		 * The page may have dirty, unmapped buffers.  For example,
		 * they may have been added in ext3_writepage().  Make them
		 * freeable here, so the page does not leak.
		 */
		/* 使整个页面失效 */
		block_invalidatepage(page, 0);
		/* 解锁页面并退出 */
		unlock_page(page);
		return 0; /* don't care */
	}

	/*
	 * The page straddles i_size.  It must be zeroed out on each and every
	 * writepage invokation because it may be mmapped.  "A file is mapped
	 * in multiples of the page size.  For a file that is not a multiple of
	 * the  page size, the remaining memory is zeroed when mapped, and
	 * writes to that region are not written out to the file."
	 */
	/* 页面位于文件最后，首先映射页面 */
	kaddr = kmap_atomic(page, KM_USER0);
	/* 将超出部分清0，避免向磁盘写入垃圾数据 */
	memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
	flush_dcache_page(page);
	kunmap_atomic(kaddr, KM_USER0);
	/* 将整个页面写入到磁盘 */
	return __block_write_full_page(inode, page, get_block, wbc);
}

sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
			    get_block_t *get_block)
{
	struct buffer_head tmp;
	struct inode *inode = mapping->host;
	tmp.b_state = 0;
	tmp.b_blocknr = 0;
	get_block(inode, block, &tmp, 0);
	return tmp.b_blocknr;
}

/**
 * 当针对BIO上的IO数据传输终止时，内核调用bi_end_io方法
 * 一般来说，bi_end_io是end_bio_bh_io_sync
 */
static int end_bio_bh_io_sync(struct bio *bio, unsigned int bytes_done, int err)
{
	/**
	 * 从bi_private中获得缓冲区首部的地址
	 */
	struct buffer_head *bh = bio->bi_private;

	if (bio->bi_size)
		return 1;

	if (err == -EOPNOTSUPP) {
		set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
		set_bit(BH_Eopnotsupp, &bh->b_state);
	}

	/**
	 * 调用缓冲区首部的b_end_io，如end_buffer_async_read
	 */
	bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
	/**
	 * 释放bio.
	 */
	bio_put(bio);
	return 0;
}

/**
 * 向内核通用块层传递一个缓冲区首部。并由此请求传输一个数据块。
 * rw:		数据传输方向(读或者写)
 * bh:		要传送数据的块缓冲区首部。
 */
int submit_bh(int rw, struct buffer_head * bh)
{
	struct bio *bio;
	int ret = 0;

	BUG_ON(!buffer_locked(bh));
	BUG_ON(!buffer_mapped(bh));
	BUG_ON(!bh->b_end_io);

	if (buffer_ordered(bh) && (rw == WRITE))
		rw = WRITE_BARRIER;

	/*
	 * Only clear out a write error when rewriting, should this
	 * include WRITE_SYNC as well?
	 */
	/**
	 * test_set_buffer_req设置缓冲区首部的BH_Req标志，以表示块至少被访问过一次。
	 * 如果数据传输方向是WRITE，就将BH_Write_EIO标志清0.
	 */
	if (test_set_buffer_req(bh) && (rw == WRITE || rw == WRITE_BARRIER))
		clear_buffer_write_io_error(bh);

	/*
	 * from here on down, it's all bio -- do the initial mapping,
	 * submit_bio -> generic_make_request may further map this bio around
	 */
	/**
	 * 分配一个新的BIO描述符。
	 */
	bio = bio_alloc(GFP_NOIO, 1);

	/**
	 * 根据缓冲区首部的内容初始化bio描述符的字段。
	 * 把块中的第一个扇区的号赋给bi_sector。
	 * 把块设备描述符的地址赋给bi_bdev。
	 * 把块大小赋给bi_size。
	 * 初始化bi_io_vec的第一个元素，以使该段对应于块缓冲区。
	 * 把bi_vcnt置为1(只有一个bio的段)，并把bi_idx置为0.
	 * 把end_bio_bh_io_sync的地址赋给bi_end_io字段，并把缓冲区首部的地址赋给bi_private字段。
	 */
	bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
	bio->bi_bdev = bh->b_bdev;
	bio->bi_io_vec[0].bv_page = bh->b_page;
	bio->bi_io_vec[0].bv_len = bh->b_size;
	bio->bi_io_vec[0].bv_offset = bh_offset(bh);

	bio->bi_vcnt = 1;
	bio->bi_idx = 0;
	bio->bi_size = bh->b_size;

	bio->bi_end_io = end_bio_bh_io_sync;/* 在bio完成后再释放它 */
	bio->bi_private = bh;

	/**
	 * 递增bio的引用计数。
	 */
	bio_get(bio);
	/**
	 * submit_bio把bi_rw标志设置为数据传输的方向。更新每CPU变量page_states以表示读和写的扇区数。
	 * 并对bio描述符调用generic_make_request函数。
	 */
	submit_bio(rw, bio);

	if (bio_flagged(bio, BIO_EOPNOTSUPP))
		ret = -EOPNOTSUPP;

	/**
	 * 递减bio的使用计数器。因为bio描述符现在已经被插入IO调度程序的队列，所以没有释放bio描述符。
	 */
	bio_put(bio);
	return ret;
}

/**
 * ll_rw_block: low-level access to block devices (DEPRECATED)
 * @rw: whether to %READ or %WRITE or maybe %READA (readahead)
 * @nr: number of &struct buffer_heads in the array
 * @bhs: array of pointers to &struct buffer_head
 *
 * ll_rw_block() takes an array of pointers to &struct buffer_heads,
 * and requests an I/O operation on them, either a %READ or a %WRITE.
 * The third %READA option is described in the documentation for
 * generic_make_request() which ll_rw_block() calls.
 *
 * This function drops any buffer that it cannot get a lock on (with the
 * BH_Lock state bit), any buffer that appears to be clean when doing a
 * write request, and any buffer that appears to be up-to-date when doing
 * read request.  Further it marks as clean buffers that are processed for
 * writing (the buffer cache won't assume that they are actually clean until
 * the buffer gets unlocked).
 *
 * ll_rw_block sets b_end_io to simple completion handler that marks
 * the buffer up-to-date (if approriate), unlocks the buffer and wakes
 * any waiters. 
 *
 * All of the buffers must be for the same device, and must also be a
 * multiple of the current approved size for the device.
 */
/**
 * 进行几个数据块的数据传输，这些数据块不一定物理上相邻。
 * 注意:	在Io完成之前，缓冲区被锁住。
 * rw:		数据传输的方向。
 * nr:		要传输的数据块的块数量。
 * bhs:		指向块缓冲区所对应的缓冲区首部的指针数组。
 */
void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
{
	int i;

	/**
	 * 在所有缓冲区首部上循环。
	 */
	for (i = 0; i < nr; i++) {
		struct buffer_head *bh = bhs[i];

		/**
		 * 检查并设置缓冲区首部的BH_Lock标志。
		 * 如果缓冲区已经被锁住，而另外一个内核控制路径已经激活了数据传输，就不处理这个缓冲区。
		 */
		if (test_set_buffer_locked(bh))
			continue;

		/**
		 * 把缓冲区首部的使用计数器b_count加1.
		 */
		get_bh(bh);
		if (rw == WRITE) {
			/**
			 * 如果数据传输的方向是WRITE，就让缓冲区首部的方法b_end_io指向函数end_buffer_write_sync
			 */
			bh->b_end_io = end_buffer_write_sync;
			/**
			 * 检查并清除缓冲区首部的BH_Dirty标志。
			 * 如果该标志没有置位，就不必把块写入磁盘。
			 */
			if (test_clear_buffer_dirty(bh)) {
				/**
				 * 需要读或者写块，调用submit_bh把缓冲区首部传递到通用块块。
				 */
				submit_bh(WRITE, bh);
				continue;
			}
		} else {
			/**
			 * 如果数据传输的方向不是WRITE，就让缓冲区首部的方法b_end_io指向函数end_buffer_read_sync
			 */
			bh->b_end_io = end_buffer_read_sync;
			/**
			 * 如果数据传输方向不是WRITE，就判断缓冲区首部的BH_Uptodate标志是否被置位，如果是，就不必从磁盘读块。
			 */
			if (!buffer_uptodate(bh)) {
				/**
				 * 需要读或者写块，调用submit_bh把缓冲区首部传递到通用块块。
				 */
				submit_bh(rw, bh);
				continue;
			}
		}
		/**
		 * 通过清除BH_Lock标志为缓冲区首部解锁，然后唤醒所有等待块解锁的进程。
		 */
		unlock_buffer(bh);
		/**
		 * 递减缓冲区首部的b_count字段。
		 */
		put_bh(bh);
	}
}

/*
 * For a data-integrity writeout, we need to wait upon any in-progress I/O
 * and then start new I/O and then wait upon it.  The caller must have a ref on
 * the buffer_head.
 */
int sync_dirty_buffer(struct buffer_head *bh)
{
	int ret = 0;

	WARN_ON(atomic_read(&bh->b_count) < 1);
	lock_buffer(bh);
	if (test_clear_buffer_dirty(bh)) {
		get_bh(bh);
		bh->b_end_io = end_buffer_write_sync;
		ret = submit_bh(WRITE, bh);
		wait_on_buffer(bh);
		if (buffer_eopnotsupp(bh)) {
			clear_buffer_eopnotsupp(bh);
			ret = -EOPNOTSUPP;
		}
		if (!ret && !buffer_uptodate(bh))
			ret = -EIO;
	} else {
		unlock_buffer(bh);
	}
	return ret;
}

/*
 * try_to_free_buffers() checks if all the buffers on this particular page
 * are unused, and releases them if so.
 *
 * Exclusion against try_to_free_buffers may be obtained by either
 * locking the page or by holding its mapping's private_lock.
 *
 * If the page is dirty but all the buffers are clean then we need to
 * be sure to mark the page clean as well.  This is because the page
 * may be against a block device, and a later reattachment of buffers
 * to a dirty page will set *all* buffers dirty.  Which would corrupt
 * filesystem data on the same device.
 *
 * The same applies to regular filesystem pages: if all the buffers are
 * clean then we set the page clean and proceed.  To do that, we require
 * total exclusion from __set_page_dirty_buffers().  That is obtained with
 * private_lock.
 *
 * try_to_free_buffers() is non-blocking.
 */
static inline int buffer_busy(struct buffer_head *bh)
{
	return atomic_read(&bh->b_count) |
		(bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
}

static int
drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
{
	struct buffer_head *head = page_buffers(page);
	struct buffer_head *bh;

	bh = head;
	do {
		if (buffer_write_io_error(bh))
			set_bit(AS_EIO, &page->mapping->flags);
		if (buffer_busy(bh))
			goto failed;
		bh = bh->b_this_page;
	} while (bh != head);

	do {
		struct buffer_head *next = bh->b_this_page;

		if (!list_empty(&bh->b_assoc_buffers))
			__remove_assoc_queue(bh);
		bh = next;
	} while (bh != head);
	*buffers_to_free = head;
	__clear_page_buffers(page);
	return 1;
failed:
	return 0;
}

int try_to_free_buffers(struct page *page)
{
	struct address_space * const mapping = page->mapping;
	struct buffer_head *buffers_to_free = NULL;
	int ret = 0;

	BUG_ON(!PageLocked(page));
	/**
	 * 检查页中所有缓冲区的首部标志。如果正在写回，说明不能释放这些缓冲区，返回0.
	 */
	if (PageWriteback(page))
		return 0;

	if (mapping == NULL) {		/* can this still happen? */
		ret = drop_buffers(page, &buffers_to_free);
		goto out;
	}

	spin_lock(&mapping->private_lock);
	/**
	 * 如果缓冲区首部在间接缓冲区的链表中，则从链表中删除它。
	 */
	ret = drop_buffers(page, &buffers_to_free);
	if (ret) {
		/*
		 * If the filesystem writes its buffers by hand (eg ext3)
		 * then we can have clean buffers against a dirty page.  We
		 * clean the page here; otherwise later reattachment of buffers
		 * could encounter a non-uptodate page, which is unresolvable.
		 * This only applies in the rare case where try_to_free_buffers
		 * succeeds but the page is not freed.
		 */
		/** 
		 * 清除页的PG_dirty标志。
		 */
		clear_page_dirty(page);
	}
	spin_unlock(&mapping->private_lock);
out:
	if (buffers_to_free) {
		struct buffer_head *bh = buffers_to_free;

		/**
		 * 反复调用free_buffer_head，以释放页的所有缓冲区首部。
		 */
		do {
			struct buffer_head *next = bh->b_this_page;
			free_buffer_head(bh);
			bh = next;
		} while (bh != buffers_to_free);
	}
	return ret;
}
EXPORT_SYMBOL(try_to_free_buffers);

int block_sync_page(struct page *page)
{
	struct address_space *mapping;

	smp_mb();
	mapping = page_mapping(page);
	if (mapping)
		blk_run_backing_dev(mapping->backing_dev_info, page);
	return 0;
}

/*
 * There are no bdflush tunables left.  But distributions are
 * still running obsolete flush daemons, so we terminate them here.
 *
 * Use of bdflush() is deprecated and will be removed in a future kernel.
 * The `pdflush' kernel threads fully replace bdflush daemons and this call.
 */
asmlinkage long sys_bdflush(int func, long data)
{
	static int msg_count;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (msg_count < 5) {
		msg_count++;
		printk(KERN_INFO
			"warning: process `%s' used the obsolete bdflush"
			" system call\n", current->comm);
		printk(KERN_INFO "Fix your initscripts?\n");
	}

	if (func == 1)
		do_exit(0);
	return 0;
}

/*
 * Buffer-head allocation
 */
/**
 * 缓冲区首部有它自己的slab分配器高速缓存。其描述符kmem_cache_s存放在变量bh_cachep中
 */
static kmem_cache_t *bh_cachep;

/*
 * Once the number of bh's in the machine exceeds this level, we start
 * stripping them in writeback.
 */
static int max_buffer_heads;

int buffer_heads_over_limit;

struct bh_accounting {
	int nr;			/* Number of live bh's */
	int ratelimit;		/* Limit cacheline bouncing */
};

static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};

static void recalc_bh_state(void)
{
	int i;
	int tot = 0;

	if (__get_cpu_var(bh_accounting).ratelimit++ < 4096)
		return;
	__get_cpu_var(bh_accounting).ratelimit = 0;
	for_each_cpu(i)
		tot += per_cpu(bh_accounting, i).nr;
	buffer_heads_over_limit = (tot > max_buffer_heads);
}

/**
 * 页高速缓冲区首部有自己的slab分配器：bh_cachep
 * alloc_buffer_head函数用于获取缓冲区首部
 */
struct buffer_head *alloc_buffer_head(int gfp_flags)
{
	struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags);
	if (ret) {
		preempt_disable();
		__get_cpu_var(bh_accounting).nr++;
		recalc_bh_state();
		preempt_enable();
	}
	return ret;
}
EXPORT_SYMBOL(alloc_buffer_head);

/**
 * 释放缓冲区首部。
 */
void free_buffer_head(struct buffer_head *bh)
{
	BUG_ON(!list_empty(&bh->b_assoc_buffers));
	kmem_cache_free(bh_cachep, bh);
	preempt_disable();
	__get_cpu_var(bh_accounting).nr--;
	recalc_bh_state();
	preempt_enable();
}
EXPORT_SYMBOL(free_buffer_head);

static void
init_buffer_head(void *data, kmem_cache_t *cachep, unsigned long flags)
{
	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
			    SLAB_CTOR_CONSTRUCTOR) {
		struct buffer_head * bh = (struct buffer_head *)data;

		memset(bh, 0, sizeof(*bh));
		INIT_LIST_HEAD(&bh->b_assoc_buffers);
	}
}

#ifdef CONFIG_HOTPLUG_CPU
static void buffer_exit_cpu(int cpu)
{
	int i;
	struct bh_lru *b = &per_cpu(bh_lrus, cpu);

	for (i = 0; i < BH_LRU_SIZE; i++) {
		brelse(b->bhs[i]);
		b->bhs[i] = NULL;
	}
}

static int buffer_cpu_notify(struct notifier_block *self,
			      unsigned long action, void *hcpu)
{
	if (action == CPU_DEAD)
		buffer_exit_cpu((unsigned long)hcpu);
	return NOTIFY_OK;
}
#endif /* CONFIG_HOTPLUG_CPU */

void __init buffer_init(void)
{
	int nrpages;

	bh_cachep = kmem_cache_create("buffer_head",
			sizeof(struct buffer_head), 0,
			SLAB_PANIC, init_buffer_head, NULL);

	/*
	 * Limit the bh occupancy to 10% of ZONE_NORMAL
	 */
	nrpages = (nr_free_buffer_pages() * 10) / 100;
	max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
	hotcpu_notifier(buffer_cpu_notify, 0);
}

EXPORT_SYMBOL(__bforget);
EXPORT_SYMBOL(__brelse);
EXPORT_SYMBOL(__wait_on_buffer);
EXPORT_SYMBOL(block_commit_write);
EXPORT_SYMBOL(block_prepare_write);
EXPORT_SYMBOL(block_read_full_page);
EXPORT_SYMBOL(block_sync_page);
EXPORT_SYMBOL(block_truncate_page);
EXPORT_SYMBOL(block_write_full_page);
EXPORT_SYMBOL(cont_prepare_write);
EXPORT_SYMBOL(end_buffer_async_write);
EXPORT_SYMBOL(end_buffer_read_sync);
EXPORT_SYMBOL(end_buffer_write_sync);
EXPORT_SYMBOL(file_fsync);
EXPORT_SYMBOL(fsync_bdev);
EXPORT_SYMBOL(generic_block_bmap);
EXPORT_SYMBOL(generic_commit_write);
EXPORT_SYMBOL(generic_cont_expand);
EXPORT_SYMBOL(init_buffer);
EXPORT_SYMBOL(invalidate_bdev);
EXPORT_SYMBOL(ll_rw_block);
EXPORT_SYMBOL(mark_buffer_dirty);
EXPORT_SYMBOL(submit_bh);
EXPORT_SYMBOL(sync_dirty_buffer);
EXPORT_SYMBOL(unlock_buffer);
